diff --git a/.claude/agents/compliance.md b/.claude/agents/compliance.md new file mode 100644 index 0000000..262021c --- /dev/null +++ b/.claude/agents/compliance.md @@ -0,0 +1,8 @@ +--- +name: compliance +model: claude-sonnet-4-5 +description: CLAUDE.md compliance specialist +--- + +Audit changed files against relevant CLAUDE.md guidance. +Return only JSON findings with concrete rule references. diff --git a/.claude/agents/quality.md b/.claude/agents/quality.md new file mode 100644 index 0000000..9b64144 --- /dev/null +++ b/.claude/agents/quality.md @@ -0,0 +1,8 @@ +--- +name: quality +model: claude-opus-4-6 +description: Code quality specialist for correctness and reliability +--- + +Find high-signal correctness, reliability, and performance issues. +Return only JSON findings. diff --git a/.claude/agents/security.md b/.claude/agents/security.md new file mode 100644 index 0000000..2295274 --- /dev/null +++ b/.claude/agents/security.md @@ -0,0 +1,8 @@ +--- +name: security +model: claude-opus-4-6 +description: Security specialist for exploitable vulnerabilities +--- + +Find exploitable vulnerabilities in changed code with concrete attack paths. +Return only JSON findings including exploit preconditions and trust boundary. diff --git a/.claude/agents/triage.md b/.claude/agents/triage.md new file mode 100644 index 0000000..69fa84c --- /dev/null +++ b/.claude/agents/triage.md @@ -0,0 +1,8 @@ +--- +name: triage +model: claude-haiku-4-5 +description: Fast PR triage for skip/continue decisions +--- + +Determine whether review can be skipped safely. +Return only JSON with `skip_review`, `reason`, and `risk_level`. diff --git a/.claude/agents/validator.md b/.claude/agents/validator.md new file mode 100644 index 0000000..1d69298 --- /dev/null +++ b/.claude/agents/validator.md @@ -0,0 +1,8 @@ +--- +name: validator +model: claude-sonnet-4-5 +description: Finding validation and deduplication specialist +--- + +Validate candidate findings with strict confidence and impact criteria. +Return only JSON decisions for keep/drop. diff --git a/.claude/commands/review.md b/.claude/commands/review.md index 50ae979..9a9e14f 100644 --- a/.claude/commands/review.md +++ b/.claude/commands/review.md @@ -40,6 +40,12 @@ To do this, follow these steps precisely: Agent 4: Opus security agent Look for security vulnerabilities in the introduced code. This includes injection, auth bypass, data exposure, unsafe deserialization, or other exploitable issues. Only look for issues that fall within the changed code. + Security evidence requirements for every reported issue: + - Include a concrete exploit or abuse path. + - Include attacker preconditions. + - Identify the impacted trust boundary or sensitive asset. + - Provide an actionable mitigation. + **CRITICAL: We only want HIGH SIGNAL issues.** Flag issues where: - The code will fail to compile or parse (syntax errors, type errors, missing imports, unresolved references) - The code will definitely produce wrong results regardless of inputs (clear logic errors) @@ -52,6 +58,7 @@ To do this, follow these steps precisely: - Subjective suggestions or improvements - Security issues that depend on speculative inputs or unverified assumptions - Denial of Service (DoS) or rate limiting issues without concrete exploitability + - Findings based only on diff snippets without validating surrounding repository context If you are not certain an issue is real, do not flag it. False positives erode trust and waste reviewer time. 
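To make the evidence contract above concrete, here is a minimal sketch of a gate that checks whether a candidate security finding carries the required evidence (exploit or abuse path, attacker preconditions, trust boundary, actionable mitigation) and clears the 0.8 confidence floor used by the validation phase. This is illustrative only and not part of the action; it assumes the finding dict follows the JSON schema emitted by the security phase (`impact`, `recommendation`, `confidence`, `exploit_preconditions`, `trust_boundary`), reading `impact` as the concrete exploit path.

```python
"""Illustrative gate for the security evidence requirements above (not shipped with the action)."""

from typing import Any, Dict

REQUIRED_EVIDENCE_FIELDS = (
    "impact",                 # concrete exploit or abuse path
    "exploit_preconditions",  # what the attacker must already control or know
    "trust_boundary",         # boundary or sensitive asset at risk
    "recommendation",         # actionable mitigation
)


def has_required_security_evidence(finding: Dict[str, Any], min_confidence: float = 0.8) -> bool:
    """Return True only if every evidence field is non-empty and confidence clears the floor."""
    for field in REQUIRED_EVIDENCE_FIELDS:
        if not str(finding.get(field, "")).strip():
            return False
    try:
        confidence = float(finding.get("confidence", 0.0))
    except (TypeError, ValueError):
        return False
    return confidence >= min_confidence


if __name__ == "__main__":
    # Hypothetical finding used only to exercise the gate.
    candidate = {
        "impact": "Unauthenticated caller can read other tenants' exports via path traversal",
        "exploit_preconditions": "Attacker controls the filename query parameter",
        "trust_boundary": "Tenant isolation of the export storage bucket",
        "recommendation": "Resolve the path against the tenant root and reject '..' segments",
        "confidence": 0.9,
    }
    print(has_required_security_evidence(candidate))  # True
```

A finding that fails this kind of check is exactly the "speculative inputs or unverified assumptions" case the exclusion list above tells the security agent not to report.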
diff --git a/.gitignore b/.gitignore index 8b8dd8e..dd3b326 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,22 @@ +# OS-generated files +.DS_Store +Thumbs.db + # Cache directories .cache/ +.pytest_cache/ # Python __pycache__/ *.py[cod] *$py.class *.pyc +.python-version +.mypy_cache/ +.ruff_cache/ +.coverage +.coverage.* +htmlcov/ # Output files *.csv @@ -21,4 +32,17 @@ env/ claudecode/claudecode-prompt.txt eval_results/ -.env \ No newline at end of file +.env +.env.* + +# Editor / IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Node / Bun +node_modules/ + +# Logs +*.log diff --git a/README.md b/README.md index 2474766..59f8179 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Nutrient Code Reviewer -An AI-powered code review GitHub Action using Claude to analyze code changes. Uses a unified multi-agent approach for both code quality (correctness, reliability, performance, maintainability, testing) and security in a single pass. This action provides intelligent, context-aware review for pull requests using Anthropic's Claude Code tool for deep semantic analysis. +An AI-powered code review GitHub Action using Claude to analyze code changes. Uses a unified multi-agent, multi-phase approach for both code quality (correctness, reliability, performance, maintainability, testing) and security. This action provides intelligent, context-aware review for pull requests using Anthropic's Claude Code tool for deep semantic analysis. Based on the original work from [anthropics/claude-code-security-review](https://github.com/anthropics/claude-code-security-review). @@ -111,9 +111,13 @@ This action is not hardened against prompt injection attacks and should only be | `comment-pr` | Whether to comment on PRs with findings | `true` | No | | `upload-results` | Whether to upload results as artifacts | `true` | No | | `exclude-directories` | Comma-separated list of directories to exclude from scanning | None | No | -| `claude-model` | Claude [model name](https://docs.anthropic.com/en/docs/about-claude/models/overview#model-names) to use. Defaults to Opus 4.5. | `claude-opus-4-5-20251101` | No | +| `claude-model` | Claude [model name](https://docs.anthropic.com/en/docs/about-claude/models/overview#model-names) to use. Defaults to Opus 4.6. | `claude-opus-4-6` | No | +| `model-triage` | Model used for triage phase (skip/continue decision). | `claude-haiku-4-5` | No | +| `model-compliance` | Model used for CLAUDE.md compliance phase. | `claude-sonnet-4-5` | No | +| `model-quality` | Model used for code quality phase. | `claude-opus-4-6` | No | +| `model-security` | Model used for security phase. | `claude-opus-4-6` | No | +| `model-validation` | Model used for finding validation phase. | `claude-sonnet-4-5` | No | | `claudecode-timeout` | Timeout for ClaudeCode analysis in minutes | `20` | No | -| `run-every-commit` | Run ClaudeCode on every commit (skips cache check). Warning: May increase false positives on PRs with many commits. **Deprecated**: Use `trigger-on-commit` instead. 
| `false` | No | | `trigger-on-open` | Run review when PR is first opened | `true` | No | | `trigger-on-commit` | Run review on every new commit | `false` | No | | `trigger-on-review-request` | Run review when someone requests a review from the bot | `true` | No | @@ -123,10 +127,10 @@ This action is not hardened against prompt injection attacks and should only be | `false-positive-filtering-instructions` | Path to custom false positive filtering instructions text file | None | No | | `custom-review-instructions` | Path to custom code review instructions text file to append to the audit prompt | None | No | | `custom-security-scan-instructions` | Path to custom security scan instructions text file to append to the security section | None | No | -| `dismiss-stale-reviews` | Dismiss previous bot reviews when posting a new review (useful for follow-up commits) | `true` | No | | `skip-draft-prs` | Skip code review on draft pull requests | `true` | No | | `app-slug` | GitHub App slug for bot mention detection. If using `actions/create-github-app-token@v1.9.0+`, pass `${{ steps.app-token.outputs.app-slug }}`. Otherwise defaults to `github-actions`. | `github-actions` | No | | `require-label` | Only run review if this label is present. Leave empty to review all PRs. Add `labeled` to your workflow `pull_request` types to trigger on label addition. | None | No | +| `max-diff-lines` | Maximum inline diff lines included as prompt anchor; repository tool reads are still required in all cases. | `5000` | No | ### Action Outputs @@ -294,11 +298,12 @@ claudecode/ ### Workflow -1. **PR Analysis**: When a pull request is opened, Claude analyzes the diff to understand what changed -2. **Contextual Review**: Claude examines the code changes in context, understanding the purpose and potential impacts -3. **Finding Generation**: Issues are identified with detailed explanations, severity ratings, and remediation guidance -4. **False Positive Filtering**: Advanced filtering removes low-impact or false positive prone findings to reduce noise -5. **PR Comments**: Findings are posted as review comments on the specific lines of code +1. **Triage Phase**: A fast triage pass determines if review should proceed. +2. **Context Discovery**: Claude discovers relevant CLAUDE.md files, hotspots, and risky code paths. +3. **Specialist Review**: Dedicated compliance, quality, and security phases run with configurable models. +4. **Validation Phase**: Candidate findings are validated and deduplicated for high signal. +5. **False Positive Filtering**: Additional filtering removes low-impact noise. +6. **PR Comments**: Findings are posted as review comments on specific lines in the PR. ## Review Capabilities diff --git a/action.yml b/action.yml index ec846aa..241b8bb 100644 --- a/action.yml +++ b/action.yml @@ -29,15 +29,34 @@ inputs: default: '' claude-model: - description: 'Claude model to use for code review analysis (e.g., claude-sonnet-4-20250514)' + description: 'Claude model to use for code review analysis (e.g., claude-sonnet-4-5)' required: false default: '' - run-every-commit: - description: 'DEPRECATED: Use trigger-on-commit instead. Run ClaudeCode on every commit (skips cache check). Warning: This may lead to more false positives on PRs with many commits as the AI analyzes the same code multiple times.' + model-triage: + description: 'Model for triage phase' required: false - default: 'false' - deprecationMessage: 'run-every-commit is deprecated. 
Use trigger-on-commit instead for more granular control over when reviews run.' + default: 'claude-haiku-4-5' + + model-compliance: + description: 'Model for CLAUDE.md compliance phase' + required: false + default: 'claude-sonnet-4-5' + + model-quality: + description: 'Model for code quality phase' + required: false + default: 'claude-opus-4-6' + + model-security: + description: 'Model for security phase' + required: false + default: 'claude-opus-4-6' + + model-validation: + description: 'Model for validation phase' + required: false + default: 'claude-sonnet-4-5' false-positive-filtering-instructions: description: 'Path to custom false positive filtering instructions text file' @@ -249,7 +268,6 @@ runs: GITHUB_EVENT_NAME: ${{ github.event_name }} PR_NUMBER: ${{ github.event.pull_request.number || steps.pr-info.outputs.pr_number }} GITHUB_SHA: ${{ github.event.pull_request.head.sha || steps.pr-info.outputs.pr_sha || github.sha }} - RUN_EVERY_COMMIT: ${{ inputs.run-every-commit }} TRIGGER_ON_OPEN: ${{ inputs.trigger-on-open }} TRIGGER_ON_COMMIT: ${{ inputs.trigger-on-commit }} TRIGGER_ON_REVIEW_REQUEST: ${{ inputs.trigger-on-review-request }} @@ -351,6 +369,11 @@ runs: CUSTOM_REVIEW_INSTRUCTIONS: ${{ inputs.custom-review-instructions }} CUSTOM_SECURITY_SCAN_INSTRUCTIONS: ${{ inputs.custom-security-scan-instructions }} CLAUDE_MODEL: ${{ inputs.claude-model }} + MODEL_TRIAGE: ${{ inputs.model-triage }} + MODEL_COMPLIANCE: ${{ inputs.model-compliance }} + MODEL_QUALITY: ${{ inputs.model-quality }} + MODEL_SECURITY: ${{ inputs.model-security }} + MODEL_VALIDATION: ${{ inputs.model-validation }} CLAUDECODE_TIMEOUT: ${{ inputs.claudecode-timeout }} MAX_DIFF_LINES: ${{ inputs.max-diff-lines }} ACTION_PATH: ${{ github.action_path }} diff --git a/claudecode/__init__.py b/claudecode/__init__.py index d0cbfdc..07611ed 100644 --- a/claudecode/__init__.py +++ b/claudecode/__init__.py @@ -12,11 +12,16 @@ from claudecode.github_action_audit import ( GitHubActionClient, SimpleClaudeRunner, + get_review_model_config, main ) +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator __all__ = [ "GitHubActionClient", "SimpleClaudeRunner", + "ReviewModelConfig", + "ReviewOrchestrator", + "get_review_model_config", "main" ] diff --git a/claudecode/claude_api_client.py b/claudecode/claude_api_client.py index 5504b93..849f733 100644 --- a/claudecode/claude_api_client.py +++ b/claudecode/claude_api_client.py @@ -59,7 +59,7 @@ def validate_api_access(self) -> Tuple[bool, str]: try: # Simple test call to verify API access self.client.messages.create( - model="claude-3-5-haiku-20241022", + model="claude-haiku-4-5", max_tokens=10, messages=[{"role": "user", "content": "Hello"}], timeout=10 diff --git a/claudecode/constants.py b/claudecode/constants.py index bbf7988..6c81257 100644 --- a/claudecode/constants.py +++ b/claudecode/constants.py @@ -5,7 +5,7 @@ import os # API Configuration -DEFAULT_CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL') or 'claude-opus-4-5-20251101' +DEFAULT_CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL') or 'claude-opus-4-6' DEFAULT_TIMEOUT_SECONDS = 180 # 3 minutes DEFAULT_MAX_RETRIES = 3 RATE_LIMIT_BACKOFF_MAX = 30 # Maximum backoff time for rate limits @@ -20,4 +20,3 @@ # Subprocess Configuration SUBPROCESS_TIMEOUT = 1200 # 20 minutes for Claude Code execution - diff --git a/claudecode/findings_merge.py b/claudecode/findings_merge.py new file mode 100644 index 0000000..aae73cf --- /dev/null +++ b/claudecode/findings_merge.py @@ -0,0 +1,61 @@ +"""Utilities for merging and 
deduplicating findings from multiple phases.""" + +from typing import Any, Dict, List, Tuple + + +def _normalize_text(value: Any) -> str: + return str(value or "").strip().lower() + + +def _finding_key(finding: Dict[str, Any]) -> Tuple[str, int, str, str]: + file_path = _normalize_text(finding.get("file")) + line = finding.get("line") + try: + line_no = int(line) + except (TypeError, ValueError): + line_no = 1 + category = _normalize_text(finding.get("category")) + title = _normalize_text(finding.get("title")) + return file_path, line_no, category, title + + +def _severity_rank(value: Any) -> int: + sev = _normalize_text(value).upper() + if sev == "HIGH": + return 3 + if sev == "MEDIUM": + return 2 + if sev == "LOW": + return 1 + return 0 + + +def _confidence_value(value: Any) -> float: + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def merge_findings(findings: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Merge duplicate findings and keep the strongest candidate.""" + merged: Dict[Tuple[str, int, str, str], Dict[str, Any]] = {} + + for finding in findings: + if not isinstance(finding, dict): + continue + + key = _finding_key(finding) + existing = merged.get(key) + + if existing is None: + merged[key] = finding + continue + + incoming_score = (_severity_rank(finding.get("severity")), _confidence_value(finding.get("confidence"))) + existing_score = (_severity_rank(existing.get("severity")), _confidence_value(existing.get("confidence"))) + + if incoming_score > existing_score: + merged[key] = finding + + return list(merged.values()) diff --git a/claudecode/format_pr_comments.py b/claudecode/format_pr_comments.py deleted file mode 100644 index eb60fe3..0000000 --- a/claudecode/format_pr_comments.py +++ /dev/null @@ -1,314 +0,0 @@ -"""Format PR comments for review context in prompts.""" - -from typing import Dict, Any, List -from datetime import datetime - -from claudecode.logger import get_logger - -logger = get_logger(__name__) - -# Maximum characters for review context to avoid prompt bloat -MAX_CONTEXT_CHARS = 15000 -# Maximum replies to include per thread -MAX_REPLIES_PER_THREAD = 5 -# Bot comment marker pattern -BOT_COMMENT_MARKER = "🤖 **Code Review Finding:" - -# GitHub reaction content to emoji mapping -REACTION_EMOJI_MAP = { - '+1': '👍', - '-1': '👎', - 'laugh': '😄', - 'confused': '😕', - 'heart': '❤️', - 'hooray': '🎉', - 'rocket': '🚀', - 'eyes': '👀', -} - - -def is_bot_comment(comment: Dict[str, Any]) -> bool: - """Check if a comment was posted by this bot. - - Args: - comment: Comment dictionary from GitHub API. - - Returns: - True if this is a bot review comment. - """ - body = comment.get('body', '') - user = comment.get('user', {}) - - return BOT_COMMENT_MARKER in body and user.get('type') == 'Bot' - - -def format_pr_comments_for_prompt( - bot_comment_threads: List[Dict[str, Any]], -) -> str: - """Format bot comment threads as review context for the prompt. - - Takes pre-built thread structures and formats them for Claude to consider - during re-review. - - Args: - bot_comment_threads: List of thread dicts, each containing: - - 'bot_comment': The original bot finding comment - - 'replies': List of user reply comments - - 'reactions': Dict of reaction counts (e.g., {'+1': 2, '-1': 1}) - - Returns: - Formatted string with previous review threads, or empty string if none. 
- """ - if not bot_comment_threads: - logger.info("No bot comment threads to format") - return "" - - # Apply reply truncation - threads = _truncate_replies(bot_comment_threads) - - logger.info(f"Formatting {len(threads)} bot comment thread(s)") - return _format_threads_for_prompt(threads) - - -def _truncate_replies(threads: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Truncate long reply threads to keep prompt size manageable. - - Args: - threads: List of thread dictionaries. - - Returns: - Threads with replies truncated to MAX_REPLIES_PER_THREAD. - """ - result = [] - for thread in threads: - thread_copy = dict(thread) - replies = thread_copy.get('replies', []) - - if len(replies) > MAX_REPLIES_PER_THREAD: - truncated_count = len(replies) - MAX_REPLIES_PER_THREAD - thread_copy['replies'] = replies[-MAX_REPLIES_PER_THREAD:] - thread_copy['truncated_replies'] = truncated_count - - result.append(thread_copy) - - return result - - -def _parse_bot_comment(body: str) -> Dict[str, str]: - """Parse structured data from bot comment body. - - Args: - body: The comment body text. - - Returns: - Dictionary with extracted fields (title, severity, category, etc.) - """ - result = { - 'title': '', - 'severity': '', - 'category': '', - 'impact': '', - 'recommendation': '' - } - - # Extract title from "🤖 **Code Review Finding: {title}**" - if BOT_COMMENT_MARKER in body: - start = body.find(BOT_COMMENT_MARKER) + len(BOT_COMMENT_MARKER) - end = body.find('**', start) - if end > start: - result['title'] = body[start:end].strip() - - # Extract severity - if '**Severity:**' in body: - start = body.find('**Severity:**') + len('**Severity:**') - end = body.find('\n', start) - if end > start: - result['severity'] = body[start:end].strip() - - # Extract category - if '**Category:**' in body: - start = body.find('**Category:**') + len('**Category:**') - end = body.find('\n', start) - if end > start: - result['category'] = body[start:end].strip() - - # Extract impact - if '**Impact:**' in body: - start = body.find('**Impact:**') + len('**Impact:**') - end = body.find('\n\n', start) - if end == -1: - end = body.find('**Recommendation:**', start) - if end > start: - result['impact'] = body[start:end].strip() - - # Extract recommendation - if '**Recommendation:**' in body: - start = body.find('**Recommendation:**') + len('**Recommendation:**') - end = body.find('\n\n', start) - if end == -1: - end = body.find('```', start) # Code suggestion block - if end == -1: - end = len(body) - if end > start: - result['recommendation'] = body[start:end].strip() - - return result - - -def _format_timestamp(iso_timestamp: str) -> str: - """Format ISO timestamp to readable format. - - Args: - iso_timestamp: ISO 8601 timestamp string. - - Returns: - Human-readable timestamp. - """ - try: - dt = datetime.fromisoformat(iso_timestamp.replace('Z', '+00:00')) - return dt.strftime('%Y-%m-%d %H:%M UTC') - except (ValueError, AttributeError): - return iso_timestamp - - -def _format_threads_for_prompt(threads: List[Dict[str, Any]]) -> str: - """Format threads as readable text for Claude's prompt. - - Args: - threads: List of thread dictionaries. - - Returns: - Formatted string for inclusion in prompt. 
- """ - if not threads: - return "" - - lines = [ - "", - "═" * 65, - "PREVIOUS REVIEW CONTEXT", - "═" * 65, - "", - "The following findings were raised in previous reviews of this PR.", - "Review user responses to determine if issues should be re-raised.", - "" - ] - - for i, thread in enumerate(threads, 1): - bot_comment = thread['bot_comment'] - replies = thread['replies'] - reactions = thread.get('reactions', {}) - - # Get file and line info - file_path = bot_comment.get('path', 'unknown') - line = bot_comment.get('line') or bot_comment.get('original_line', '?') - - # Parse bot comment structure - parsed = _parse_bot_comment(bot_comment.get('body', '')) - - # Format thread header - lines.append(f"THREAD {i} - {file_path}:{line}") - lines.append("─" * 65) - - # Bot finding - timestamp = _format_timestamp(bot_comment.get('created_at', '')) - lines.append(f"Bot Finding ({timestamp})") - - if parsed['severity']: - lines.append(f" Severity: {parsed['severity']}") - if parsed['category']: - lines.append(f" Category: {parsed['category']}") - if parsed['title']: - lines.append(f" Title: {parsed['title']}") - if parsed['impact']: - lines.append(f" Impact: {parsed['impact'][:500]}...") # Truncate long impacts - if parsed['recommendation']: - lines.append(f" Recommendation: {parsed['recommendation'][:500]}...") - - # Add user reactions (excluding bot's own reactions) - if reactions: - reaction_parts = [] - for reaction, count in reactions.items(): - if count > 0: - emoji = REACTION_EMOJI_MAP.get(reaction, reaction) - reaction_parts.append(f"{emoji} {count}") - - if reaction_parts: - lines.append(f" User Reactions: {', '.join(reaction_parts)}") - - lines.append("") - - # Truncation notice - if thread.get('truncated_replies'): - lines.append(f" ({thread['truncated_replies']} earlier replies omitted)") - lines.append("") - - # User replies - for reply in replies: - user = reply.get('user', {}).get('login', 'unknown') - reply_timestamp = _format_timestamp(reply.get('created_at', '')) - reply_body = reply.get('body', '').strip() - - # Truncate very long replies - if len(reply_body) > 1000: - reply_body = reply_body[:1000] + "... (truncated)" - - lines.append(f"User Reply ({user}, {reply_timestamp}):") - # Indent reply text - for reply_line in reply_body.split('\n'): - lines.append(f" {reply_line}") - lines.append("") - - lines.append("─" * 65) - lines.append("") - - # Add instructions for re-review - lines.extend([ - "═" * 65, - "INSTRUCTIONS FOR RE-REVIEW", - "═" * 65, - "", - "When reviewing this PR with the above context:", - "", - "1. CHECK IF ISSUES WERE ADDRESSED: Compare previous findings against", - " the current diff. If code was changed to fix the issue, do NOT", - " re-raise it.", - "", - "2. EVALUATE USER RESPONSES: Read user replies carefully. Valid", - " dismissals include:", - " - Demonstrating the issue is a false positive with evidence", - " - Showing existing mitigations the bot missed", - " - Providing strong technical justification", - "", - " Invalid dismissals include:", - " - \"We'll fix this later\" (without code change)", - " - Misunderstanding the vulnerability/issue", - " - Ignoring the issue without explanation", - "", - "3. CONSIDER USER REACTIONS: Reactions provide additional signal:", - " - 👎 (thumbs down) suggests users found the finding unhelpful", - " - 👍 (thumbs up) suggests users found the finding valuable", - " - High 👎 count with no reply may indicate obvious false positive", - " - Use reactions as one input, but prioritize reply content", - "", - "4. 
RE-RAISE WHEN APPROPRIATE: If an issue was invalidly dismissed", - " or remains unaddressed, re-raise it with:", - " - Reference to the previous discussion", - " - Response to the user's dismissal reasoning", - " - Updated title: \"[Issue Title] (previously raised)\"", - "", - "5. DO NOT REPEAT RESOLVED ISSUES: If code was changed to address", - " a finding, do not mention it unless the fix is incomplete.", - "", - "═" * 65, - "" - ]) - - result = '\n'.join(lines) - - # Truncate if too long - if len(result) > MAX_CONTEXT_CHARS: - logger.warning(f"Review context truncated from {len(result)} to {MAX_CONTEXT_CHARS} chars") - result = result[:MAX_CONTEXT_CHARS] + "\n\n(Review context truncated due to length)\n" - - return result \ No newline at end of file diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index dbf6766..ca80af4 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -15,10 +15,9 @@ import time # Import existing components we can reuse -from claudecode.prompts import get_unified_review_prompt from claudecode.findings_filter import FindingsFilter from claudecode.json_parser import parse_json_with_fallbacks -from claudecode.format_pr_comments import format_pr_comments_for_prompt, is_bot_comment +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator from claudecode.constants import ( EXIT_CONFIGURATION_ERROR, DEFAULT_CLAUDE_MODEL, @@ -27,7 +26,6 @@ SUBPROCESS_TIMEOUT ) from claudecode.logger import get_logger -from claudecode.review_schema import REVIEW_OUTPUT_SCHEMA logger = get_logger(__name__) @@ -35,10 +33,6 @@ class ConfigurationError(ValueError): """Raised when configuration is invalid or missing.""" pass -class AuditError(ValueError): - """Raised when code review operations fail.""" - pass - class GitHubActionClient: """Simplified GitHub API client for GitHub Actions environment.""" @@ -186,98 +180,23 @@ def get_pr_data(self, repo_name: str, pr_number: int) -> Dict[str, Any]: def get_pr_diff(self, repo_name: str, pr_number: int) -> str: """Get complete PR diff in unified format. - + Args: repo_name: Repository name in format "owner/repo" pr_number: Pull request number - + Returns: Complete PR diff in unified format """ url = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}" headers = dict(self.headers) headers['Accept'] = 'application/vnd.github.diff' - + response = requests.get(url, headers=headers) response.raise_for_status() - + return self._filter_generated_files(response.text) - def get_pr_comments(self, repo_name: str, pr_number: int) -> List[Dict[str, Any]]: - """Get all review comments for a PR with pagination. 
- - Args: - repo_name: Repository name in format "owner/repo" - pr_number: Pull request number - - Returns: - List of comment dictionaries from GitHub API - """ - all_comments = [] - page = 1 - per_page = 100 - - while True: - url = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}/comments" - params = {'per_page': per_page, 'page': page} - - try: - response = requests.get(url, headers=self.headers, params=params) - response.raise_for_status() - comments = response.json() - - if not comments: - break - - all_comments.extend(comments) - - # Check if there are more pages - if len(comments) < per_page: - break - - page += 1 - - except requests.RequestException as e: - logger.warning(f"Failed to fetch comments page {page}: {e}") - break - - return all_comments - - def get_comment_reactions(self, repo_name: str, comment_id: int) -> Dict[str, int]: - """Get reactions for a specific comment, excluding bot reactions. - - Args: - repo_name: Repository name in format "owner/repo" - comment_id: The comment ID to fetch reactions for - - Returns: - Dictionary with reaction counts (e.g., {'+1': 3, '-1': 1}) - """ - url = f"https://api.github.com/repos/{repo_name}/pulls/comments/{comment_id}/reactions" - - try: - response = requests.get(url, headers=self.headers) - response.raise_for_status() - reactions = response.json() - - # Count reactions, excluding those from bots - counts = {} - for reaction in reactions: - user = reaction.get('user', {}) - # Skip bot reactions (the bot adds its own 👍👎 as seeds) - if user.get('type') == 'Bot': - continue - - content = reaction.get('content', '') - if content: - counts[content] = counts.get(content, 0) + 1 - - return counts - - except requests.RequestException as e: - logger.debug(f"Failed to fetch reactions for comment {comment_id}: {e}") - return {} - def _is_excluded(self, filepath: str) -> bool: """Check if a file should be excluded based on directory or file patterns.""" import fnmatch @@ -310,6 +229,10 @@ def _is_excluded(self, filepath: str) -> bool: return True return False + + def is_excluded(self, filepath: str) -> bool: + """Public wrapper for exclusion checks.""" + return self._is_excluded(filepath) def _filter_generated_files(self, diff_text: str) -> str: """Filter out generated files and excluded directories from diff content.""" @@ -355,114 +278,82 @@ def __init__(self, timeout_minutes: Optional[int] = None): else: self.timeout_seconds = SUBPROCESS_TIMEOUT - def run_code_review(self, repo_dir: Path, prompt: str) -> Tuple[bool, str, Dict[str, Any]]: - """Run Claude Code review. 
- - Args: - repo_dir: Path to repository directory - prompt: Code review prompt - - Returns: - Tuple of (success, error_message, parsed_results) - """ + def run_prompt( + self, + repo_dir: Path, + prompt: str, + model: Optional[str] = None, + json_schema: Optional[Dict[str, Any]] = None, + ) -> Tuple[bool, str, Any]: + """Run a single Claude prompt and return parsed JSON payload or text.""" if not repo_dir.exists(): return False, f"Repository directory does not exist: {repo_dir}", {} - - # Check prompt size + prompt_size = len(prompt.encode('utf-8')) if prompt_size > 1024 * 1024: # 1MB print(f"[Warning] Large prompt size: {prompt_size / 1024 / 1024:.2f}MB", file=sys.stderr) - + + model_name = model or DEFAULT_CLAUDE_MODEL try: - # Construct Claude Code command - # Use stdin for prompt to avoid "argument list too long" error cmd = [ 'claude', '--output-format', 'json', - '--model', DEFAULT_CLAUDE_MODEL, + '--model', model_name, '--disallowed-tools', 'Bash(ps:*)', - '--json-schema', json.dumps(REVIEW_OUTPUT_SCHEMA) ] - - # Run Claude Code with retry logic + if json_schema: + cmd.extend(['--json-schema', json.dumps(json_schema)]) + NUM_RETRIES = 3 for attempt in range(NUM_RETRIES): result = subprocess.run( cmd, - input=prompt, # Pass prompt via stdin + input=prompt, cwd=repo_dir, capture_output=True, text=True, timeout=self.timeout_seconds ) - # Parse JSON output (even if returncode != 0, to detect specific errors) - success, parsed_result = parse_json_with_fallbacks(result.stdout, "Claude Code output") + if result.returncode != 0: + if attempt == NUM_RETRIES - 1: + error_details = f"Claude Code execution failed with return code {result.returncode}\n" + error_details += f"Stderr: {result.stderr}\n" + error_details += f"Stdout: {result.stdout[:500]}..." + return False, error_details, {} + time.sleep(5 * (attempt + 1)) + continue + success, parsed_result = parse_json_with_fallbacks(result.stdout, "Claude Code output") if success: - # Check for "Prompt is too long" error that should trigger fallback to agentic mode - if (isinstance(parsed_result, dict) and - parsed_result.get('type') == 'result' and + if (isinstance(parsed_result, dict) and + parsed_result.get('type') == 'result' and parsed_result.get('subtype') == 'success' and parsed_result.get('is_error') and parsed_result.get('result') == 'Prompt is too long'): return False, "PROMPT_TOO_LONG", {} - # Check for error_during_execution that should trigger retry - if (isinstance(parsed_result, dict) and - parsed_result.get('type') == 'result' and + if (isinstance(parsed_result, dict) and + parsed_result.get('type') == 'result' and parsed_result.get('subtype') == 'error_during_execution' and - attempt == 0): - continue # Retry - - # If returncode is 0, extract review findings - if result.returncode == 0: - parsed_results = self._extract_review_findings(parsed_result) - return True, "", parsed_results + attempt < NUM_RETRIES - 1): + continue - # Handle non-zero return codes after parsing - if result.returncode != 0: - if attempt == NUM_RETRIES - 1: - error_details = f"Claude Code execution failed with return code {result.returncode}\n" - error_details += f"Stderr: {result.stderr}\n" - error_details += f"Stdout: {result.stdout[:500]}..." 
# First 500 chars - return False, error_details, {} - else: - time.sleep(5*attempt) - # Note: We don't do exponential backoff here to keep the runtime reasonable - continue # Retry + if isinstance(parsed_result, dict) and 'result' in parsed_result and isinstance(parsed_result['result'], str): + nested_success, nested = parse_json_with_fallbacks(parsed_result['result'], "Claude result text") + if nested_success: + return True, "", nested + return True, "", parsed_result - # Parse failed if attempt == 0: - continue # Retry once - else: - return False, "Failed to parse Claude output", {} - + continue + return False, "Failed to parse Claude output", {} + return False, "Unexpected error in retry logic", {} - except subprocess.TimeoutExpired: return False, f"Claude Code execution timed out after {self.timeout_seconds // 60} minutes", {} except Exception as e: return False, f"Claude Code execution error: {str(e)}", {} - - def _extract_review_findings(self, claude_output: Any) -> Dict[str, Any]: - """Extract review findings and PR summary from Claude's JSON response.""" - if isinstance(claude_output, dict): - # Only accept Claude Code wrapper with result field - # Direct format without wrapper is not supported - if 'result' in claude_output: - result_text = claude_output['result'] - if isinstance(result_text, str): - # Try to extract JSON from the result text - success, result_json = parse_json_with_fallbacks(result_text, "Claude result text") - if success and result_json and 'findings' in result_json and 'pr_summary' in result_json: - return result_json - - # Return empty structure if no findings found - return { - 'findings': [], - 'pr_summary': {} - } def validate_claude_available(self) -> Tuple[bool, str]: """Validate that Claude Code is available.""" @@ -559,17 +450,14 @@ def initialize_findings_filter(custom_filtering_instructions: Optional[str] = No ConfigurationError: If filter initialization fails """ try: - # Check if we should use heuristic (pattern-based) filtering - use_heuristic_filtering = os.environ.get('ENABLE_HEURISTIC_FILTERING', 'true').lower() == 'true' - # Check if we should use Claude API filtering use_claude_filtering = os.environ.get('ENABLE_CLAUDE_FILTERING', 'false').lower() == 'true' api_key = os.environ.get('ANTHROPIC_API_KEY') - + if use_claude_filtering and api_key: # Use full filtering with Claude API return FindingsFilter( - use_hard_exclusions=use_heuristic_filtering, + use_hard_exclusions=True, use_claude_filtering=True, api_key=api_key, custom_filtering_instructions=custom_filtering_instructions @@ -577,36 +465,27 @@ def initialize_findings_filter(custom_filtering_instructions: Optional[str] = No else: # Fallback to filtering with hard rules only return FindingsFilter( - use_hard_exclusions=use_heuristic_filtering, + use_hard_exclusions=True, use_claude_filtering=False ) except Exception as e: raise ConfigurationError(f'Failed to initialize findings filter: {str(e)}') +def get_review_model_config() -> ReviewModelConfig: + """Resolve per-phase model configuration from environment.""" + return ReviewModelConfig.from_env(os.environ, DEFAULT_CLAUDE_MODEL) + + +def get_max_diff_lines() -> int: + """Return bounded inline diff budget in lines.""" + max_diff_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') + try: + parsed = int(max_diff_lines_str) + except ValueError: + parsed = 5000 + return max(0, parsed) -def run_code_review(claude_runner: SimpleClaudeRunner, prompt: str) -> Dict[str, Any]: - """Run the code review with Claude Code. 
- - Args: - claude_runner: Claude runner instance - prompt: The review prompt - - Returns: - Review results dictionary - - Raises: - AuditError: If the review fails - """ - # Get repo directory from environment or use current directory - repo_path = os.environ.get('REPO_PATH') - repo_dir = Path(repo_path) if repo_path else Path.cwd() - success, error_msg, results = claude_runner.run_code_review(repo_dir, prompt) - - if not success: - raise AuditError(f'Code review failed: {error_msg}') - - return results def apply_findings_filter(findings_filter, original_findings: List[Dict[str, Any]], @@ -670,7 +549,7 @@ def _is_finding_in_excluded_directory(finding: Dict[str, Any], github_client: Gi if not file_path: return False - return github_client._is_excluded(file_path) + return github_client.is_excluded(file_path) def main(): @@ -744,147 +623,52 @@ def main(): print(json.dumps({'error': f'Failed to fetch PR data: {str(e)}'})) sys.exit(EXIT_GENERAL_ERROR) - # Fetch PR comments and build review context - review_context = None - try: - pr_comments = github_client.get_pr_comments(repo_name, pr_number) - if pr_comments: - # Build threads: find bot comments, their replies, and reactions - bot_comment_threads = [] - for comment in pr_comments: - if is_bot_comment(comment): - # This is a bot comment (thread root) - reactions = github_client.get_comment_reactions(repo_name, comment['id']) - - # Find replies to this comment - replies = [ - c for c in pr_comments - if c.get('in_reply_to_id') == comment['id'] - ] - # Sort replies by creation time - replies.sort(key=lambda c: c.get('created_at', '')) - - bot_comment_threads.append({ - 'bot_comment': comment, - 'replies': replies, - 'reactions': reactions, - }) - - # Sort threads by bot comment creation time (oldest first) - bot_comment_threads.sort(key=lambda t: t['bot_comment'].get('created_at', '')) - - if bot_comment_threads: - review_context = format_pr_comments_for_prompt(bot_comment_threads) - if review_context: - logger.info(f"Fetched previous review context ({len(review_context)} chars)") - except Exception as e: - logger.warning(f"Failed to fetch review context (continuing without it): {e}") - review_context = None - - # Determine whether to embed diff or use agentic file reading - max_diff_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') - try: - max_diff_lines = int(max_diff_lines_str) - except ValueError: - max_diff_lines = 5000 - + max_diff_lines = get_max_diff_lines() diff_line_count = len(pr_diff.splitlines()) - use_agentic_mode = max_diff_lines == 0 or diff_line_count > max_diff_lines - - if use_agentic_mode: - print(f"[Info] Using agentic file reading mode (diff has {diff_line_count} lines, threshold: {max_diff_lines})", file=sys.stderr) + if max_diff_lines and diff_line_count > max_diff_lines: + print(f"[Info] Hybrid mode with truncated inline diff ({max_diff_lines}/{diff_line_count} lines)", file=sys.stderr) else: - print(f"[Debug] Embedding diff in prompt ({diff_line_count} lines)", file=sys.stderr) + print(f"[Info] Hybrid mode with full inline diff ({diff_line_count} lines)", file=sys.stderr) # Get repo directory from environment or use current directory repo_path = os.environ.get('REPO_PATH') repo_dir = Path(repo_path) if repo_path else Path.cwd() + model_config = get_review_model_config() + orchestrator = ReviewOrchestrator( + claude_runner=claude_runner, + findings_filter=findings_filter, + github_client=github_client, + model_config=model_config, + max_diff_lines=max_diff_lines, + ) - def run_review(include_diff: bool): - 
prompt_text = get_unified_review_prompt( - pr_data, - pr_diff if include_diff else None, - include_diff=include_diff, - custom_review_instructions=custom_review_instructions, - custom_security_instructions=custom_security_instructions, - review_context=review_context, - ) - return claude_runner.run_code_review(repo_dir, prompt_text), len(prompt_text) - - all_findings = [] - pr_summary_from_review = {} - - try: - (success, error_msg, review_results), prompt_len = run_review(include_diff=not use_agentic_mode) - - # Fallback to agentic mode if prompt still too long - if not success and error_msg == "PROMPT_TOO_LONG": - print(f"[Info] Prompt too long ({prompt_len} chars), falling back to agentic mode", file=sys.stderr) - (success, error_msg, review_results), prompt_len = run_review(include_diff=False) - - if not success: - raise AuditError(f'Code review failed: {error_msg}') - - pr_summary_from_review = review_results.get('pr_summary', {}) - for finding in review_results.get('findings', []): - if isinstance(finding, dict): - # Set review_type based on category - category = finding.get('category', '').lower() - if category == 'security': - finding.setdefault('review_type', 'security') - else: - finding.setdefault('review_type', 'general') - all_findings.append(finding) - - except AuditError as e: - print(json.dumps({'error': f'Code review failed: {str(e)}'})) + success, review_results, review_error = orchestrator.run( + repo_dir=repo_dir, + pr_data=pr_data, + pr_diff=pr_diff, + custom_review_instructions=custom_review_instructions, + custom_security_instructions=custom_security_instructions, + ) + if not success: + print(json.dumps({'error': f'Code review failed: {review_error}'})) sys.exit(EXIT_GENERAL_ERROR) - # Filter findings to reduce false positives - original_findings = all_findings - - # Prepare PR context for better filtering - pr_context = { - 'repo_name': repo_name, - 'pr_number': pr_number, - 'title': pr_data.get('title', ''), - 'description': pr_data.get('body', '') - } - - # Apply findings filter (including final directory exclusion) - kept_findings, excluded_findings, analysis_summary = apply_findings_filter( - findings_filter, original_findings, pr_context, github_client - ) - - # Prepare output summary - def severity_counts(findings_list): - high = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) - medium = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'MEDIUM']) - low = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'LOW']) - return high, medium, low - - high_count, medium_count, low_count = severity_counts(kept_findings) - - # Calculate files_reviewed from pr_summary.file_changes - files_reviewed = 0 - if isinstance(pr_summary_from_review, dict): - file_changes = pr_summary_from_review.get('file_changes', []) - if isinstance(file_changes, list): - # Count unique files from all 'files' arrays - all_files = set() - for entry in file_changes: - if isinstance(entry, dict): - files_list = entry.get('files', []) - if isinstance(files_list, list): - all_files.update(files_list) - files_reviewed = len(all_files) + findings = review_results.get('findings', []) + analysis_summary = review_results.get('analysis_summary', {}) + high_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) + medium_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'MEDIUM']) + 
low_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'LOW']) + files_reviewed = analysis_summary.get('files_reviewed', pr_data.get('changed_files', 0)) + try: + files_reviewed = int(files_reviewed) + except (TypeError, ValueError): + files_reviewed = 0 # Prepare output output = { 'pr_number': pr_number, 'repo': repo_name, - 'pr_summary': pr_summary_from_review, - 'findings': kept_findings, + 'findings': findings, 'analysis_summary': { 'files_reviewed': files_reviewed, 'high_severity': high_count, @@ -892,20 +676,18 @@ def severity_counts(findings_list): 'low_severity': low_count, 'review_completed': True }, - 'filtering_summary': { - 'total_original_findings': len(original_findings), - 'excluded_findings': len(excluded_findings), - 'kept_findings': len(kept_findings), - 'filter_analysis': analysis_summary, - 'excluded_findings_details': excluded_findings # Include full details of what was filtered - } + 'filtering_summary': review_results.get('filtering_summary', { + 'total_original_findings': len(findings), + 'excluded_findings': 0, + 'kept_findings': len(findings), + }) } # Output JSON to stdout print(json.dumps(output, indent=2)) # Exit with appropriate code - high_severity_count = len([f for f in kept_findings if f.get('severity', '').upper() == 'HIGH']) + high_severity_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) sys.exit(EXIT_GENERAL_ERROR if high_severity_count > 0 else EXIT_SUCCESS) except Exception as e: diff --git a/claudecode/prompts.py b/claudecode/prompts.py index 6786858..7684710 100644 --- a/claudecode/prompts.py +++ b/claudecode/prompts.py @@ -1,268 +1,275 @@ -"""Code review prompt templates.""" +"""Prompt templates for multi-phase PR review orchestration.""" +from typing import Any, Dict, List, Optional -def _format_files_changed(pr_data): +COMPLIANCE_EXTRA_FIELDS = ',\n "rule_reference": "path/to/CLAUDE.md#section"' +SECURITY_EXTRA_FIELDS = ( + ',\n "exploit_preconditions": "...",\n "trust_boundary": "...",\n' + ' "cwe": "optional CWE-###"' +) + + +def _format_files_changed(pr_data: Dict[str, Any]) -> str: """Format changed files for prompt context.""" - return "\n".join([f"- {f['filename']}" for f in pr_data['files']]) + files = pr_data.get("files", []) + return "\n".join([f"- {f.get('filename', 'unknown')}" for f in files]) + + +def _build_hybrid_diff_section(pr_diff: str, max_lines: int) -> str: + """Build a bounded inline diff section while requiring tool-based context reads.""" + if not pr_diff: + return "\nNo inline diff available. Use repository tools to inspect changed files.\n" + if max_lines == 0: + return ( + "\nInline diff intentionally omitted (max-diff-lines=0). " + "Use repository tools to inspect changed files and context.\n" + ) + + lines = pr_diff.splitlines() + if max_lines > 0 and len(lines) > max_lines: + shown = "\n".join(lines[:max_lines]) + truncated_note = ( + f"\n[Diff truncated to {max_lines} lines out of {len(lines)} total lines. " + "You MUST use repository tools to inspect full context and missing hunks.]" + ) + return f"\nINLINE DIFF ANCHOR (TRUNCATED):\n```diff\n{shown}\n```{truncated_note}\n" + + return ( + "\nINLINE DIFF ANCHOR:\n" + f"```diff\n{pr_diff}\n```\n" + "Use this as a starting point only. 
You MUST validate findings with repository tool reads.\n" + ) + +def _base_context_block(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Shared context block used across prompts.""" + files_changed = _format_files_changed(pr_data) + return f""" +PR CONTEXT: +- PR Number: {pr_data.get('number', 'unknown')} +- Title: {pr_data.get('title', 'unknown')} +- Author: {pr_data.get('user', 'unknown')} +- Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} +- Files changed: {pr_data.get('changed_files', 0)} +- Lines added: {pr_data.get('additions', 0)} +- Lines deleted: {pr_data.get('deletions', 0)} +- PR body: {pr_data.get('body', '') or 'No description'} + +MODIFIED FILES: +{files_changed or '- None listed'} +{_build_hybrid_diff_section(pr_diff, max_diff_lines)} +MANDATORY CONTEXT VALIDATION RULES: +1. You MUST use repository tools to read each relevant changed file before finalizing findings. +2. For every finding, verify at least one additional contextual location (caller, callee, config, or sibling path). +3. Do not rely on inline diff alone, even when diff is fully present. +""" -def _build_diff_section(pr_diff, include_diff): - """Build prompt section for inline diff or agentic file reading.""" - if pr_diff and include_diff: - return f""" +def _findings_output_schema(extra_fields: str = "") -> str: + return f""" +OUTPUT JSON SCHEMA (exact keys): +{{ + "findings": [ + {{ + "file": "path/to/file.py", + "line": 42, + "severity": "HIGH|MEDIUM|LOW", + "category": "correctness|reliability|performance|maintainability|testing|security|compliance", + "title": "Short issue title", + "description": "What is wrong", + "impact": "Concrete failure mode or exploit path", + "recommendation": "Actionable fix", + "confidence": 0.93{extra_fields} + }} + ], + "analysis_summary": {{ + "files_reviewed": 0, + "high_severity": 0, + "medium_severity": 0, + "low_severity": 0, + "review_completed": true + }} +}} +""" + + +def build_triage_prompt(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Prompt for triage phase.""" + return f""" +You are the triage specialist for a pull request review. -PR DIFF CONTENT: -``` -{pr_diff} -``` +{_base_context_block(pr_data, pr_diff, max_diff_lines)} -Review the complete diff above. This contains all code changes in the PR. +Decide whether review should be skipped. +Skip only if one of the following is true: +- PR is clearly trivial and cannot contain correctness/security/compliance risk +- PR is obviously generated deployment churn with no business logic changes +- There are no meaningful code changes in reviewed files + +Return JSON only: +{{ + "skip_review": false, + "reason": "short reason", + "risk_level": "low|medium|high" +}} """ - return """ -IMPORTANT - FILE READING INSTRUCTIONS: -You have access to the repository files. For each file listed above, use the Read tool to examine the changes. -Focus on the files that are most likely to contain issues based on the PR context. +def build_context_discovery_prompt(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Prompt for context discovery phase.""" + return f""" +You are the repository context specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +Tasks: +1. Find relevant CLAUDE.md files: root and those in changed-file parent paths. +2. Summarize PR intent and risky hotspots. +3. Identify top files for deep review. -To review effectively: -1. Read each modified file to understand the current code -2. 
Look at surrounding code context when needed to understand the changes -3. Check related files if you need to understand dependencies or usage patterns +Return JSON only: +{{ + "claude_md_files": ["path/CLAUDE.md"], + "change_summary": "brief summary", + "hotspots": ["path/to/file"], + "priority_files": ["path/to/file"] +}} """ -def get_unified_review_prompt( - pr_data, - pr_diff=None, - include_diff=True, - custom_review_instructions=None, - custom_security_instructions=None, - review_context=None, -): - """Generate unified code review + security prompt for Claude Code. +def build_compliance_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], +) -> str: + """Prompt for CLAUDE.md compliance analysis.""" + context_json = discovered_context or {} + return f""" +You are the CLAUDE.md compliance specialist. - This prompt covers both code quality (correctness, reliability, performance, - maintainability, testing) and security in a single pass. +{_base_context_block(pr_data, pr_diff, max_diff_lines)} - Args: - pr_data: PR data dictionary from GitHub API - pr_diff: Optional complete PR diff in unified format - include_diff: Whether to include the diff in the prompt (default: True) - custom_review_instructions: Optional custom review instructions to append - custom_security_instructions: Optional custom security instructions to append - review_context: Optional previous review context (bot findings and user replies) +DISCOVERED CONTEXT: +{context_json} - Returns: - Formatted prompt string - """ +Focus exclusively on clear CLAUDE.md violations in changed code. Cite concrete violated rule text in each finding. +Reject ambiguous or preference-only claims. + + {_findings_output_schema(COMPLIANCE_EXTRA_FIELDS)} + +Return JSON only. +""" - files_changed = _format_files_changed(pr_data) - diff_section = _build_diff_section(pr_diff, include_diff) - - custom_review_section = "" - if custom_review_instructions: - custom_review_section = f"\n{custom_review_instructions}\n" - - custom_security_section = "" - if custom_security_instructions: - custom_security_section = f"\n{custom_security_instructions}\n" - - # Build PR description section - pr_description = pr_data.get('body', '').strip() if pr_data.get('body') else '' - pr_description_section = "" - if pr_description: - # Truncate very long descriptions - if len(pr_description) > 2000: - pr_description = pr_description[:2000] + "... (truncated)" - pr_description_section = f"\nPR Description:\n{pr_description}\n" - - # Build review context section (previous bot reviews and user replies) - review_context_section = "" - if review_context: - review_context_section = review_context +def build_quality_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], + custom_review_instructions: Optional[str] = None, +) -> str: + """Prompt for code quality analysis.""" + custom_block = f"\nCUSTOM QUALITY INSTRUCTIONS:\n{custom_review_instructions}\n" if custom_review_instructions else "" return f""" -You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data['number']}: "{pr_data['title']}" +You are the code quality specialist. 
-CONTEXT: -- Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} -- Author: {pr_data['user']} -- Files changed: {pr_data['changed_files']} -- Lines added: {pr_data['additions']} -- Lines deleted: {pr_data['deletions']} -{pr_description_section} -Files modified: -{files_changed}{diff_section}{review_context_section} - -OBJECTIVE: -Perform a focused, high-signal code review to identify HIGH-CONFIDENCE issues introduced by this PR. This covers both code quality (correctness, reliability, performance, maintainability, testing) AND security. Do not comment on pre-existing issues or purely stylistic preferences. - -CRITICAL INSTRUCTIONS: -1. MINIMIZE FALSE POSITIVES: Only flag issues where you're >80% confident they are real and impactful -2. AVOID NOISE: Skip style nits, subjective preferences, or low-impact suggestions -3. FOCUS ON IMPACT: Prioritize bugs, regressions, data loss, significant performance problems, or security vulnerabilities -4. SCOPE: Only evaluate code introduced or modified in this PR. Ignore unrelated existing issues - -CODE QUALITY CATEGORIES: - -**Correctness & Logic:** -- Incorrect business logic or wrong results -- Edge cases or null/empty handling regressions -- Incorrect error handling or missing validations leading to bad state -- Invariants broken by changes - -**Reliability & Resilience:** -- Concurrency or race conditions introduced by changes -- Resource leaks, timeouts, or missing retries in critical paths -- Partial failure handling or inconsistent state updates -- Idempotency or ordering issues - -**Performance & Scalability:** -- Algorithmic regressions in hot paths (O(n^2) where O(n) expected) -- N+1 query patterns -- Excessive synchronous I/O in latency-sensitive code -- Unbounded memory growth introduced by changes - -**Maintainability & Design:** -- Changes that significantly increase complexity or make future changes risky -- Tight coupling or unclear responsibility boundaries introduced -- Misleading APIs or brittle contracts - -**Testing & Observability:** -- Missing tests for high-risk changes -- Lack of logging/metrics around new critical behavior -- Flaky behavior due to nondeterministic changes -{custom_review_section} -SECURITY CATEGORIES: - -**Input Validation Vulnerabilities:** -- SQL injection via unsanitized user input -- Command injection in system calls or subprocesses -- XXE injection in XML parsing -- Template injection in templating engines -- NoSQL injection in database queries -- Path traversal in file operations - -**Authentication & Authorization Issues:** -- Authentication bypass logic -- Privilege escalation paths -- Session management flaws -- JWT token vulnerabilities -- Authorization logic bypasses - -**Crypto & Secrets Management:** -- Hardcoded API keys, passwords, or tokens -- Weak cryptographic algorithms or implementations -- Improper key storage or management -- Cryptographic randomness issues -- Certificate validation bypasses - -**Injection & Code Execution:** -- Remote code execution via deserialization -- Pickle injection in Python -- YAML deserialization vulnerabilities -- Eval injection in dynamic code execution -- XSS vulnerabilities in web applications (reflected, stored, DOM-based) - -**Data Exposure:** -- Sensitive data logging or storage -- PII handling violations -- API endpoint data leakage -- Debug information exposure -{custom_security_section} -EXCLUSIONS - DO NOT REPORT: -- Denial of Service (DOS) vulnerabilities or resource exhaustion attacks -- Secrets/credentials stored on 
disk (these are managed separately) -- Rate limiting concerns or service overload scenarios - -Additional notes: -- Even if something is only exploitable from the local network, it can still be a HIGH severity issue - -ANALYSIS METHODOLOGY: - -Phase 1 - Repository Context Research (Use file search tools): -- Identify existing patterns, conventions, and critical paths -- Understand data flow, invariants, and error handling expectations -- Look for established security frameworks and patterns - -Phase 2 - Comparative Analysis: -- Compare new changes to existing patterns and contracts -- Identify deviations that introduce risk, regressions, or security issues -- Look for inconsistent handling between similar code paths - -Phase 3 - Issue Assessment: -- Examine each modified file for code quality and security implications -- Trace data flow from inputs to sensitive operations -- Identify concurrency, state management, and injection risks - -REQUIRED OUTPUT FORMAT: - -You MUST output your findings as structured JSON with this exact schema: +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +DISCOVERED CONTEXT: +{discovered_context or {}} + +Focus on high-signal issues only: +- correctness and logic defects +- reliability regressions +- significant performance regressions +- maintainability risks with concrete failure/bug potential +- testing gaps only when they block confidence for risky behavior + +Exclude style-only feedback and speculative concerns. +{custom_block} +{_findings_output_schema()} + +Return JSON only. +""" + + +def build_security_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], + custom_security_instructions: Optional[str] = None, +) -> str: + """Prompt for security analysis with explicit exploitability criteria.""" + custom_block = ( + f"\nCUSTOM SECURITY INSTRUCTIONS:\n{custom_security_instructions}\n" + if custom_security_instructions + else "" + ) + + return f""" +You are the security specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +DISCOVERED CONTEXT: +{discovered_context or {}} + +Security review scope: +- injection (SQL/command/template/NoSQL/path traversal) +- authn/authz bypass and privilege escalation +- unsafe deserialization and code execution paths +- crypto and secrets handling flaws +- sensitive data exposure and trust-boundary breaks + +For every security finding you MUST provide: +1. exploit or abuse path +2. required attacker preconditions +3. impacted trust boundary or sensitive asset +4. concrete mitigation + +Do NOT report: +- generic DoS/rate-limiting comments without concrete exploitability +- speculative attacks without evidence in changed code paths +- issues outside modified scope unless required to prove exploitability +{custom_block} + {_findings_output_schema(SECURITY_EXTRA_FIELDS)} + +Return JSON only. +""" + + +def build_validation_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + candidate_findings: List[Dict[str, Any]], +) -> str: + """Prompt for finding validation and deduplication support.""" + return f""" +You are the validation specialist. 
+ +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +CANDIDATE FINDINGS: +{candidate_findings} +Validate each finding with strict criteria: +- must be reproducible or clearly inferable from changed code and surrounding context +- must have concrete impact +- confidence must be >= 0.8 to keep +- if two findings are duplicates, keep the stronger one only + +Return JSON only: {{ - "pr_summary": {{ - "overview": "2-4 sentence summary of what this PR changes and why it matters", - "file_changes": [ - {{ - "label": "src/auth.py", - "files": ["src/auth.py"], - "changes": "Brief description of changes (~10 words)" - }}, - {{ - "label": "tests/test_*.py", - "files": ["tests/test_auth.py", "tests/test_login.py"], - "changes": "Unit tests for authentication" - }} - ] - }}, - "findings": [ + "validated_findings": [ {{ - "file": "path/to/file.py", - "line": 42, - "severity": "HIGH", - "category": "correctness|reliability|performance|maintainability|testing|security", - "title": "Short summary of the issue", - "description": "What is wrong and where it happens", - "impact": "Concrete impact or failure mode (use exploit scenario for security issues)", - "recommendation": "Actionable fix or mitigation", - "suggestion": "Exact replacement code (optional). Can be multi-line. Must replace lines from suggestion_start_line to suggestion_end_line.", - "suggestion_start_line": 42, - "suggestion_end_line": 44, - "confidence": 0.95 + "finding_index": 0, + "keep": true, + "confidence": 0.92, + "reason": "short reason" }} ] }} - -PR SUMMARY GUIDELINES: -- overview: 2-4 sentences describing WHAT changed and WHY (purpose/goal) -- file_changes: One entry per file or group of related files - - label: Display name (single file path, or pattern like "tests/*.py" for groups) - - files: Array of actual file paths covered by this entry (used for counting) - - changes: Brief description (~10 words), focus on purpose not implementation - - Group related files when it improves readability (e.g., test files, config files) - -SUGGESTION GUIDELINES: -- Only include `suggestion` if you can provide exact, working replacement code -- For single-line fixes: set suggestion_start_line = suggestion_end_line = the line number -- For multi-line fixes: set the range of lines being replaced -- The suggestion replaces all lines from suggestion_start_line to suggestion_end_line (inclusive) - -SEVERITY GUIDELINES: -- **HIGH**: Likely production bug, data loss, significant regression, or directly exploitable security vulnerability -- **MEDIUM**: Real issue with limited scope or specific triggering conditions -- **LOW**: Minor but real issue; use sparingly and only if clearly actionable - -CONFIDENCE SCORING: -- 0.9-1.0: Certain issue with clear evidence and impact -- 0.8-0.9: Strong signal with likely real-world impact -- 0.7-0.8: Plausible issue but may require specific conditions -- Below 0.7: Don't report (too speculative) - -FINAL REMINDER: -Focus on HIGH and MEDIUM findings only. Better to miss some theoretical issues than flood the report with false positives. Each finding should be something a senior engineer would confidently raise in a PR review. - -Begin your analysis now. Use the repository exploration tools to understand the codebase context, then analyze the PR changes for code quality and security implications. - -Your final reply must contain the JSON and nothing else. You should not reply again after outputting the JSON. 
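For orientation, a minimal standalone sketch (illustrative names only, not code from this diff) of how the keep/confidence contract defined by the validation schema above can be applied to a list of candidate findings, consistent with how the orchestrator added later in this diff consumes validator decisions:

from typing import Any, Dict, List

def apply_validation_decisions(
    candidates: List[Dict[str, Any]],
    decisions: List[Dict[str, Any]],
    min_confidence: float = 0.8,
) -> List[Dict[str, Any]]:
    # Keep only candidates the validator marked keep=true with sufficient confidence.
    kept: List[Dict[str, Any]] = []
    for decision in decisions:
        index = decision.get("finding_index")
        if not isinstance(index, int) or not 0 <= index < len(candidates):
            continue
        confidence = float(decision.get("confidence") or 0.0)
        if decision.get("keep") and confidence >= min_confidence:
            finding = dict(candidates[index])
            finding["confidence"] = confidence
            kept.append(finding)
    return kept

candidates = [{"file": "app.py", "line": 12, "severity": "HIGH"}]
decisions = [{"finding_index": 0, "keep": True, "confidence": 0.92, "reason": "confirmed"}]
assert apply_validation_decisions(candidates, decisions)[0]["confidence"] == 0.92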
""" + diff --git a/claudecode/review_orchestrator.py b/claudecode/review_orchestrator.py new file mode 100644 index 0000000..1a8ca55 --- /dev/null +++ b/claudecode/review_orchestrator.py @@ -0,0 +1,265 @@ +"""Multi-phase review orchestration for GitHub Action execution.""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from claudecode.findings_merge import merge_findings +from claudecode.json_parser import parse_json_with_fallbacks +from claudecode.logger import get_logger +from claudecode.prompts import ( + build_compliance_prompt, + build_context_discovery_prompt, + build_quality_prompt, + build_security_prompt, + build_triage_prompt, + build_validation_prompt, +) + +logger = get_logger(__name__) + + +@dataclass +class ReviewModelConfig: + """Per-phase model configuration with global fallback.""" + + triage: str + compliance: str + quality: str + security: str + validation: str + + @classmethod + def from_env(cls, env: Dict[str, str], default_model: str) -> "ReviewModelConfig": + def resolve(key: str, fallback: str) -> str: + value = (env.get(key) or "").strip() + return value or fallback + + global_model = resolve("CLAUDE_MODEL", default_model) + return cls( + triage=resolve("MODEL_TRIAGE", global_model), + compliance=resolve("MODEL_COMPLIANCE", global_model), + quality=resolve("MODEL_QUALITY", global_model), + security=resolve("MODEL_SECURITY", global_model), + validation=resolve("MODEL_VALIDATION", global_model), + ) + + +class ReviewOrchestrator: + """Coordinates multi-phase review and returns final findings.""" + + def __init__( + self, + claude_runner: Any, + findings_filter: Any, + github_client: Any, + model_config: ReviewModelConfig, + max_diff_lines: int, + ): + self.claude_runner = claude_runner + self.findings_filter = findings_filter + self.github_client = github_client + self.model_config = model_config + self.max_diff_lines = max(0, max_diff_lines) + + def _run_phase(self, repo_dir: Path, prompt: str, model: str, phase_name: str) -> Tuple[bool, Dict[str, Any], str]: + run_prompt = getattr(self.claude_runner, "run_prompt", None) + if not callable(run_prompt): + return False, {}, f"Runner missing run_prompt for {phase_name}" + + raw_result = run_prompt(repo_dir, prompt, model=model) + if not (isinstance(raw_result, tuple) and len(raw_result) == 3): + return False, {}, f"Invalid runner response for {phase_name}" + + success, error_msg, raw = raw_result + if not success: + return False, {}, error_msg + + if isinstance(raw, dict): + return True, raw, "" + + parsed_ok, parsed = parse_json_with_fallbacks(str(raw), f"{phase_name} output") + if not parsed_ok: + return False, {}, f"Failed to parse {phase_name} output" + return True, parsed, "" + + def _is_excluded(self, filepath: str) -> bool: + checker = getattr(self.github_client, "is_excluded", None) + if callable(checker): + return bool(checker(filepath)) + raise AttributeError("github_client must implement is_excluded(filepath)") + + def _collect_phase_findings(self, phase_result: Dict[str, Any], source_agent: str) -> List[Dict[str, Any]]: + findings: List[Dict[str, Any]] = [] + for finding in phase_result.get("findings", []): + if isinstance(finding, dict): + enriched = finding.copy() + enriched.setdefault("source_agent", source_agent) + category = str(enriched.get("category", "")).lower() + if "review_type" not in enriched: + enriched["review_type"] = "security" if category == "security" else "general" + findings.append(enriched) + return findings + + def 
_ensure_review_type(self, finding: Dict[str, Any]) -> Dict[str, Any]: + enriched = finding.copy() + category = str(enriched.get("category", "")).lower() + if "review_type" not in enriched: + enriched["review_type"] = "security" if category == "security" else "general" + return enriched + + def run( + self, + repo_dir: Path, + pr_data: Dict[str, Any], + pr_diff: str, + custom_review_instructions: Optional[str] = None, + custom_security_instructions: Optional[str] = None, + ) -> Tuple[bool, Dict[str, Any], str]: + # Phase A: triage + triage_prompt = build_triage_prompt(pr_data, pr_diff, self.max_diff_lines) + ok, triage_result, err = self._run_phase(repo_dir, triage_prompt, self.model_config.triage, "triage") + if not ok: + return False, {}, f"Triage phase failed: {err}" + + if not isinstance(triage_result, dict) or "skip_review" not in triage_result: + return False, {}, "Triage phase returned invalid schema" + + if triage_result.get("skip_review") is True: + logger.info("Skipping review based on triage decision: %s", triage_result.get("reason", "")) + return True, { + "findings": [], + "analysis_summary": { + "files_reviewed": 0, + "high_severity": 0, + "medium_severity": 0, + "low_severity": 0, + "review_completed": True, + }, + "triage": triage_result, + }, "" + + # Phase B: context discovery + context_prompt = build_context_discovery_prompt(pr_data, pr_diff, self.max_diff_lines) + ok, context_result, err = self._run_phase(repo_dir, context_prompt, self.model_config.compliance, "context discovery") + if not ok: + return False, {}, f"Context discovery phase failed: {err}" + + # Phase C: specialist passes + compliance_prompt = build_compliance_prompt(pr_data, pr_diff, self.max_diff_lines, context_result) + quality_prompt = build_quality_prompt( + pr_data, + pr_diff, + self.max_diff_lines, + context_result, + custom_review_instructions=custom_review_instructions, + ) + security_prompt = build_security_prompt( + pr_data, + pr_diff, + self.max_diff_lines, + context_result, + custom_security_instructions=custom_security_instructions, + ) + + ok_c, compliance_result, err_c = self._run_phase(repo_dir, compliance_prompt, self.model_config.compliance, "compliance") + ok_q, quality_result, err_q = self._run_phase(repo_dir, quality_prompt, self.model_config.quality, "quality") + ok_s, security_result, err_s = self._run_phase(repo_dir, security_prompt, self.model_config.security, "security") + + if not ok_c: + return False, {}, f"Compliance phase failed: {err_c}" + if not ok_q: + return False, {}, f"Quality phase failed: {err_q}" + if not ok_s: + return False, {}, f"Security phase failed: {err_s}" + + all_candidates: List[Dict[str, Any]] = [] + all_candidates.extend(self._collect_phase_findings(compliance_result, "compliance")) + all_candidates.extend(self._collect_phase_findings(quality_result, "quality")) + all_candidates.extend(self._collect_phase_findings(security_result, "security")) + + all_candidates = merge_findings(all_candidates) + + # Phase D: validation + validation_prompt = build_validation_prompt(pr_data, pr_diff, self.max_diff_lines, all_candidates) + ok_v, validation_result, err_v = self._run_phase(repo_dir, validation_prompt, self.model_config.validation, "validation") + if not ok_v: + return False, {}, f"Validation phase failed: {err_v}" + + validated: List[Dict[str, Any]] = [] + has_validation_output = isinstance(validation_result, dict) and "validated_findings" in validation_result + decisions = validation_result.get("validated_findings", []) if isinstance(validation_result, dict) 
else [] + if not isinstance(decisions, list): + decisions = [] + for decision in decisions: + if not isinstance(decision, dict): + continue + idx = decision.get("finding_index") + keep = bool(decision.get("keep")) + confidence = decision.get("confidence", 0.0) + try: + idx_int = int(idx) + except (TypeError, ValueError): + continue + if idx_int < 0 or idx_int >= len(all_candidates): + continue + if keep and float(confidence or 0.0) >= 0.8: + finding = all_candidates[idx_int].copy() + finding["confidence"] = float(confidence) + validated.append(finding) + + # If validator did not return decisions at all, preserve candidates. + # If it explicitly returned validated_findings (including empty list), trust validator output. + if not has_validation_output: + validated = all_candidates + + # Apply existing filtering pipeline + pr_context = { + "repo_name": pr_data.get("head", {}).get("repo", {}).get("full_name", "unknown"), + "pr_number": pr_data.get("number"), + "title": pr_data.get("title", ""), + "description": pr_data.get("body", ""), + } + kept_findings = validated + original_count = len(all_candidates) + filter_response = self.findings_filter.filter_findings(validated, pr_context) + if isinstance(filter_response, tuple) and len(filter_response) == 3: + filter_success, filter_results, _ = filter_response + if filter_success and isinstance(filter_results, dict): + kept_findings = filter_results.get("filtered_findings", validated) + + final_findings: List[Dict[str, Any]] = [] + for finding in kept_findings: + if not isinstance(finding, dict): + continue + if self._is_excluded(finding.get("file", "")): + continue + final_findings.append(self._ensure_review_type(finding)) + + high = len([f for f in final_findings if str(f.get("severity", "")).upper() == "HIGH"]) + medium = len([f for f in final_findings if str(f.get("severity", "")).upper() == "MEDIUM"]) + low = len([f for f in final_findings if str(f.get("severity", "")).upper() == "LOW"]) + + files_reviewed = pr_data.get("changed_files", 0) + try: + files_reviewed = int(files_reviewed) + except (TypeError, ValueError): + files_reviewed = 0 + + return True, { + "findings": final_findings, + "analysis_summary": { + "files_reviewed": files_reviewed, + "high_severity": high, + "medium_severity": medium, + "low_severity": low, + "review_completed": True, + }, + "filtering_summary": { + "total_original_findings": original_count, + "excluded_findings": max(0, original_count - len(final_findings)), + "kept_findings": len(final_findings), + }, + "context_summary": context_result, + "triage": triage_result, + }, "" diff --git a/claudecode/test_claude_runner.py b/claudecode/test_claude_runner.py index 296fb78..403894b 100644 --- a/claudecode/test_claude_runner.py +++ b/claudecode/test_claude_runner.py @@ -1,438 +1,132 @@ #!/usr/bin/env python3 -""" -Unit tests for SimpleClaudeRunner. 
-""" +"""Unit tests for SimpleClaudeRunner.""" import json import os import subprocess -from unittest.mock import Mock, patch from pathlib import Path +from unittest.mock import Mock, patch -from claudecode.github_action_audit import SimpleClaudeRunner from claudecode.constants import DEFAULT_CLAUDE_MODEL +from claudecode.github_action_audit import SimpleClaudeRunner class TestSimpleClaudeRunner: - """Test SimpleClaudeRunner functionality.""" - def test_init(self): - """Test runner initialization.""" runner = SimpleClaudeRunner(timeout_minutes=30) assert runner.timeout_seconds == 1800 - - runner2 = SimpleClaudeRunner() # Default - assert runner2.timeout_seconds == 1200 # 20 minutes default - - @patch('subprocess.run') + + runner2 = SimpleClaudeRunner() + assert runner2.timeout_seconds == 1200 + + @patch("subprocess.run") def test_validate_claude_available_success(self, mock_run): - """Test successful Claude validation.""" - mock_run.return_value = Mock( - returncode=0, - stdout='claude version 1.0.0', - stderr='' - ) - - with patch.dict(os.environ, {'ANTHROPIC_API_KEY': 'test-key'}): + mock_run.return_value = Mock(returncode=0, stdout="claude version 1.0.0", stderr="") + + with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "test-key"}): runner = SimpleClaudeRunner() success, error = runner.validate_claude_available() - + assert success is True - assert error == '' - mock_run.assert_called_once_with( - ['claude', '--version'], - capture_output=True, - text=True, - timeout=10 - ) - - @patch('subprocess.run') + assert error == "" + + @patch("subprocess.run") def test_validate_claude_available_no_api_key(self, mock_run): - """Test Claude validation without API key.""" - mock_run.return_value = Mock( - returncode=0, - stdout='claude version 1.0.0', - stderr='' - ) - - # Remove API key + mock_run.return_value = Mock(returncode=0, stdout="claude version 1.0.0", stderr="") + env = os.environ.copy() - env.pop('ANTHROPIC_API_KEY', None) - + env.pop("ANTHROPIC_API_KEY", None) with patch.dict(os.environ, env, clear=True): runner = SimpleClaudeRunner() success, error = runner.validate_claude_available() - - assert success is False - assert 'ANTHROPIC_API_KEY environment variable is not set' in error - - @patch('subprocess.run') - def test_validate_claude_available_not_installed(self, mock_run): - """Test Claude validation when not installed.""" - mock_run.side_effect = FileNotFoundError() - - runner = SimpleClaudeRunner() - success, error = runner.validate_claude_available() - - assert success is False - assert 'Claude Code is not installed or not in PATH' in error - - @patch('subprocess.run') - def test_validate_claude_available_error(self, mock_run): - """Test Claude validation with error.""" - mock_run.return_value = Mock( - returncode=1, - stdout='', - stderr='Error: Authentication failed' - ) - - runner = SimpleClaudeRunner() - success, error = runner.validate_claude_available() - - assert success is False - assert 'exit code 1' in error - assert 'Authentication failed' in error - - @patch('subprocess.run') - def test_validate_claude_available_timeout(self, mock_run): - """Test Claude validation timeout.""" - mock_run.side_effect = subprocess.TimeoutExpired(['claude'], 10) - - runner = SimpleClaudeRunner() - success, error = runner.validate_claude_available() - + assert success is False - assert 'timed out' in error - - def test_run_code_review_missing_directory(self): - """Test audit with missing directory.""" + assert "ANTHROPIC_API_KEY" in error + + def test_run_prompt_missing_directory(self): 
runner = SimpleClaudeRunner() - success, error, results = runner.run_code_review( - Path('/non/existent/path'), - "test prompt" - ) - + success, error, results = runner.run_prompt(Path("/non/existent/path"), "test prompt") + assert success is False - assert 'Repository directory does not exist' in error + assert "Repository directory does not exist" in error assert results == {} - - @patch('subprocess.run') - def test_run_code_review_success(self, mock_run): - """Test successful code review.""" - # Claude Code returns wrapped format with 'result' field - findings_data = { - "pr_summary": { - "overview": "Test PR summary", - "file_changes": [ - {"label": "test.py", "files": ["test.py"], "changes": "Test changes"} - ] - }, - "findings": [ - { - "file": "test.py", - "line": 10, - "severity": "HIGH", - "description": "SQL injection vulnerability" - } - ] - } - - audit_result = { - "result": json.dumps(findings_data) - } - - mock_run.return_value = Mock( - returncode=0, - stdout=json.dumps(audit_result), - stderr='' - ) - + + @patch("subprocess.run") + def test_run_prompt_success_with_nested_json(self, mock_run): + wrapped = {"result": json.dumps({"findings": [{"file": "test.py", "line": 1}]})} + mock_run.return_value = Mock(returncode=0, stdout=json.dumps(wrapped), stderr="") + runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - + with patch("pathlib.Path.exists", return_value=True): + success, error, results = runner.run_prompt(Path("/tmp/test"), "test prompt") + assert success is True - assert error == '' - assert len(results['findings']) == 1 - assert results['findings'][0]['severity'] == 'HIGH' - - # Verify subprocess call - mock_run.assert_called_once() - call_args = mock_run.call_args - cmd = call_args[0][0] - assert cmd[0] == 'claude' - assert '--output-format' in cmd - assert 'json' in cmd - assert '--model' in cmd + assert error == "" + assert results["findings"][0]["file"] == "test.py" + + cmd = mock_run.call_args[0][0] + assert cmd[0] == "claude" + assert "--output-format" in cmd + assert "json" in cmd + assert "--model" in cmd assert DEFAULT_CLAUDE_MODEL in cmd - assert '--disallowed-tools' in cmd - assert 'Bash(ps:*)' in cmd - assert '--json-schema' in cmd - assert call_args[1]['input'] == 'test prompt' - assert call_args[1]['cwd'] == Path('/tmp/test') - - @patch('subprocess.run') - def test_run_code_review_large_prompt_warning(self, mock_run, capsys): - """Test warning for large prompts.""" - mock_run.return_value = Mock( - returncode=0, - stdout='{"findings": []}', - stderr='' - ) - - # Create a prompt larger than 1MB - large_prompt = 'x' * (1024 * 1024 + 1000) - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - large_prompt - ) - - captured = capsys.readouterr() - assert '[Warning] Large prompt size' in captured.err - assert success is True - - @patch('subprocess.run') - def test_run_code_review_retry_on_failure(self, mock_run): - """Test retry logic on failure.""" - # First call fails, second succeeds - mock_run.side_effect = [ - Mock(returncode=1, stdout='', stderr='Temporary error'), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] - + assert "--disallowed-tools" in cmd + assert "Bash(ps:*)" in cmd + + @patch("subprocess.run") + def test_run_prompt_adds_json_schema_when_provided(self, mock_run): + mock_run.return_value = 
Mock(returncode=0, stdout='{"ok": true}', stderr="") + runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" + with patch("pathlib.Path.exists", return_value=True): + success, _, _ = runner.run_prompt( + Path("/tmp/test"), + "test prompt", + json_schema={"type": "object", "properties": {"ok": {"type": "boolean"}}}, ) - + assert success is True - assert error == '' - assert mock_run.call_count == 2 # Retried once - - @patch('subprocess.run') - def test_run_code_review_retry_on_error_during_execution(self, mock_run): - """Test retry on error_during_execution result.""" - error_result = { - "type": "result", - "subtype": "error_during_execution", - "error": "Temporary execution error" - } - - success_result = { - "result": json.dumps({ - "pr_summary": { - "overview": "Test", - "file_changes": [{"label": "test.py", "files": ["test.py"], "changes": "Test"}] - }, - "findings": [{"file": "test.py", "line": 1, "severity": "LOW", "description": "Issue"}] - }) - } - + cmd = mock_run.call_args[0][0] + assert "--json-schema" in cmd + + @patch("subprocess.run") + def test_run_prompt_retry_on_failure(self, mock_run): mock_run.side_effect = [ - Mock(returncode=0, stdout=json.dumps(error_result), stderr=''), - Mock(returncode=0, stdout=json.dumps(success_result), stderr='') + Mock(returncode=1, stdout="", stderr="Temporary error"), + Mock(returncode=0, stdout='{"findings": []}', stderr=""), ] - + runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - + with patch("pathlib.Path.exists", return_value=True): + success, error, _ = runner.run_prompt(Path("/tmp/test"), "test prompt") + assert success is True - assert len(results['findings']) == 1 + assert error == "" assert mock_run.call_count == 2 - - @patch('subprocess.run') - def test_run_code_review_timeout(self, mock_run): - """Test timeout handling.""" - mock_run.side_effect = subprocess.TimeoutExpired(['claude'], 1200) - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - - assert success is False - assert 'timed out after 20 minutes' in error - assert results == {} - - @patch('subprocess.run') - def test_run_code_review_json_parse_failure_with_retry(self, mock_run): - """Test JSON parse failure with retry.""" + + @patch("subprocess.run") + def test_run_prompt_json_parse_failure(self, mock_run): mock_run.side_effect = [ - Mock(returncode=0, stdout='Invalid JSON', stderr=''), - Mock(returncode=0, stdout='Still invalid', stderr='') + Mock(returncode=0, stdout="Invalid JSON", stderr=""), + Mock(returncode=0, stdout="Still invalid", stderr=""), ] - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - - assert success is False - assert 'Failed to parse Claude output' in error - assert mock_run.call_count == 2 - - def test_extract_review_findings_claude_wrapper(self): - """Test extraction from Claude Code wrapper format.""" - runner = SimpleClaudeRunner() - - # Test with result field containing JSON string - claude_output = { - "result": json.dumps({ - "pr_summary": {"overview": "Test", "file_changes": []}, - "findings": [ - {"file": "test.py", "line": 10, 
"severity": "HIGH"} - ] - }) - } - result = runner._extract_review_findings(claude_output) - assert len(result['findings']) == 1 - assert result['findings'][0]['file'] == 'test.py' - - def test_extract_review_findings_direct_format(self): - """Test that direct findings format was removed - only wrapped format is supported.""" runner = SimpleClaudeRunner() + with patch("pathlib.Path.exists", return_value=True): + success, error, _ = runner.run_prompt(Path("/tmp/test"), "test prompt") - # Direct format (without 'result' wrapper) should return empty - claude_output = { - "findings": [ - {"file": "main.py", "line": 20, "severity": "MEDIUM"} - ], - "pr_summary": { - "overview": "Test", - "file_changes": [] - } - } - - result = runner._extract_review_findings(claude_output) - # Should return empty structure since direct format is not supported - assert len(result['findings']) == 0 - assert result['pr_summary'] == {} - - def test_extract_review_findings_text_fallback(self): - """Test that text fallback was removed - only JSON is supported.""" - runner = SimpleClaudeRunner() - - # Test with result containing text (not JSON) - claude_output = { - "result": "Found SQL injection vulnerability in database.py line 45" - } + assert success is False + assert "Failed to parse Claude output" in error + assert mock_run.call_count == 2 - # Should return empty findings since we don't parse text anymore - result = runner._extract_review_findings(claude_output) - assert len(result['findings']) == 0 - assert result['pr_summary'] == {} - - def test_extract_review_findings_empty(self): - """Test extraction with no findings.""" - runner = SimpleClaudeRunner() + @patch("subprocess.run") + def test_run_prompt_timeout(self, mock_run): + mock_run.side_effect = subprocess.TimeoutExpired(["claude"], 1200) - # Various empty formats - for output in [None, {}, {"result": ""}, {"other": "data"}]: - result = runner._extract_review_findings(output) - assert result['findings'] == [] - assert result['pr_summary'] == {} - - def test_create_findings_from_text(self): - """Test that _create_findings_from_text was removed.""" runner = SimpleClaudeRunner() - - # Method should not exist - assert not hasattr(runner, '_create_findings_from_text') - - def test_create_findings_from_text_no_issues(self): - """Test that _create_findings_from_text was removed.""" - runner = SimpleClaudeRunner() - - # Method should not exist - assert not hasattr(runner, '_create_findings_from_text') - + with patch("pathlib.Path.exists", return_value=True): + success, error, results = runner.run_prompt(Path("/tmp/test"), "test prompt") -class TestClaudeRunnerEdgeCases: - """Test edge cases and error scenarios.""" - - @patch('subprocess.run') - def test_claude_output_formats(self, mock_run): - """Test various Claude output formats.""" - runner = SimpleClaudeRunner() - - # Test nested JSON in result - result field should be string - nested_output = { - "type": "result", - "result": json.dumps({ - "pr_summary": {"overview": "Test", "file_changes": []}, - "findings": [ - {"file": "test.py", "line": 1, "severity": "HIGH", "description": "Issue"} - ] - }) - } - - with patch('pathlib.Path.exists', return_value=True): - mock_run.return_value = Mock( - returncode=0, - stdout=json.dumps(nested_output), - stderr='' - ) - - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test" - ) - - # Should extract findings from nested structure - assert success is True - assert len(results['findings']) == 1 - - @patch('subprocess.run') - def 
test_partial_json_recovery(self, mock_run): - """Test recovery from partial JSON output.""" - # Simulate truncated JSON - partial_json = '{"findings": [{"file": "test.py", "line": 10, "sev' - - mock_run.return_value = Mock( - returncode=0, - stdout=partial_json, - stderr='' - ) - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test" - ) - - # Should fail to parse and retry - assert mock_run.call_count == 2 - - @patch('subprocess.run') - def test_exception_handling(self, mock_run): - """Test general exception handling.""" - mock_run.side_effect = Exception("Unexpected error") - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test" - ) - assert success is False - assert 'Unexpected error' in error + assert "timed out" in error assert results == {} diff --git a/claudecode/test_findings_merge.py b/claudecode/test_findings_merge.py new file mode 100644 index 0000000..9129e7a --- /dev/null +++ b/claudecode/test_findings_merge.py @@ -0,0 +1,32 @@ +"""Unit tests for findings_merge utilities.""" + +from claudecode.findings_merge import merge_findings + + +def test_merge_findings_empty_input(): + assert merge_findings([]) == [] + + +def test_merge_findings_keeps_higher_severity_then_confidence(): + findings = [ + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "MEDIUM", "confidence": 0.9}, + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "HIGH", "confidence": 0.8}, + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "HIGH", "confidence": 0.95}, + ] + + merged = merge_findings(findings) + assert len(merged) == 1 + assert merged[0]["severity"] == "HIGH" + assert merged[0]["confidence"] == 0.95 + + +def test_merge_findings_separate_keys_are_preserved(): + findings = [ + {"file": "a.py", "line": 1, "category": "correctness", "title": "One", "severity": "LOW"}, + {"file": "a.py", "line": 2, "category": "correctness", "title": "Two", "severity": "LOW"}, + {"file": "b.py", "line": 1, "category": "security", "title": "One", "severity": "MEDIUM"}, + ] + + merged = merge_findings(findings) + assert len(merged) == 3 + diff --git a/claudecode/test_format_pr_comments.py b/claudecode/test_format_pr_comments.py deleted file mode 100644 index b96fbc3..0000000 --- a/claudecode/test_format_pr_comments.py +++ /dev/null @@ -1,372 +0,0 @@ -"""Unit tests for the format_pr_comments module.""" - -import pytest -from claudecode.format_pr_comments import ( - format_pr_comments_for_prompt, - is_bot_comment, - _truncate_replies, - _parse_bot_comment, - _format_timestamp, - BOT_COMMENT_MARKER, -) - - -def _make_bot_comment( - comment_id: int, - file_path: str = "app.py", - line: int = 42, - title: str = "SQL Injection", - severity: str = "HIGH", - category: str = "security", - impact: str = "Attacker can access database", - recommendation: str = "Use parameterized queries", - created_at: str = "2024-01-15T10:00:00Z", -): - """Create a bot comment for testing.""" - body = f"""{BOT_COMMENT_MARKER} {title}** - -**Severity:** {severity} -**Category:** {category} - -**Impact:** {impact} - -**Recommendation:** {recommendation} -""" - return { - 'id': comment_id, - 'body': body, - 'path': file_path, - 'line': line, - 'created_at': created_at, - 'user': {'login': 'github-actions[bot]', 'type': 'Bot'}, - 
'in_reply_to_id': None, - } - - -def _make_user_reply( - comment_id: int, - in_reply_to_id: int, - body: str, - username: str = "testuser", - created_at: str = "2024-01-15T11:00:00Z", -): - """Create a user reply comment for testing.""" - return { - 'id': comment_id, - 'body': body, - 'path': 'app.py', - 'line': 42, - 'created_at': created_at, - 'user': {'login': username, 'type': 'User'}, - 'in_reply_to_id': in_reply_to_id, - } - - -def _make_thread(bot_comment, replies=None, reactions=None): - """Create a thread structure for testing.""" - return { - 'bot_comment': bot_comment, - 'replies': replies or [], - 'reactions': reactions or {}, - } - - -class TestFormatPrCommentsForPrompt: - """Tests for format_pr_comments_for_prompt function.""" - - def test_no_threads(self): - """Test with no threads.""" - result = format_pr_comments_for_prompt([]) - assert result == "" - - def test_single_thread_no_replies(self): - """Test with single bot comment, no replies.""" - bot_comment = _make_bot_comment(1) - threads = [_make_thread(bot_comment)] - - result = format_pr_comments_for_prompt(threads) - - assert "PREVIOUS REVIEW CONTEXT" in result - assert "SQL Injection" in result - assert "HIGH" in result - assert "security" in result - assert "app.py:42" in result - - def test_thread_with_user_replies(self): - """Test with bot comment and user replies.""" - bot_comment = _make_bot_comment(1) - user_reply = _make_user_reply( - 2, 1, - "This is a false positive - we sanitize input at the controller level.", - username="alice" - ) - threads = [_make_thread(bot_comment, replies=[user_reply])] - - result = format_pr_comments_for_prompt(threads) - - assert "PREVIOUS REVIEW CONTEXT" in result - assert "SQL Injection" in result - assert "alice" in result - assert "false positive" in result - assert "sanitize input" in result - - def test_multiple_threads(self): - """Test with multiple bot findings and replies.""" - bot_comment1 = _make_bot_comment( - 1, "auth.py", 10, "Hardcoded Password", "HIGH", "security" - ) - bot_comment2 = _make_bot_comment( - 2, "api.py", 50, "N+1 Query", "MEDIUM", "performance", - created_at="2024-01-15T10:30:00Z" - ) - reply1 = _make_user_reply(3, 1, "Fixed in commit abc123") - reply2 = _make_user_reply(4, 2, "This is by design for now") - - threads = [ - _make_thread(bot_comment1, replies=[reply1]), - _make_thread(bot_comment2, replies=[reply2]), - ] - - result = format_pr_comments_for_prompt(threads) - - assert "THREAD 1" in result - assert "THREAD 2" in result - assert "Hardcoded Password" in result - assert "N+1 Query" in result - assert "Fixed in commit" in result - assert "by design" in result - - def test_reactions_included(self): - """Test that reactions are included when provided.""" - bot_comment = _make_bot_comment(1) - threads = [_make_thread(bot_comment, reactions={'-1': 2})] - - result = format_pr_comments_for_prompt(threads) - - # Should show 2 thumbs down - assert "👎 2" in result - - def test_all_reaction_types(self): - """Test that all GitHub reaction types are converted to emojis.""" - bot_comment = _make_bot_comment(1) - reactions = { - '+1': 3, - '-1': 1, - 'laugh': 2, - 'confused': 1, - 'heart': 4, - 'hooray': 2, - 'rocket': 1, - 'eyes': 3, - } - threads = [_make_thread(bot_comment, reactions=reactions)] - - result = format_pr_comments_for_prompt(threads) - - # All reactions should be converted to emojis - assert "👍 3" in result - assert "👎 1" in result - assert "😄 2" in result - assert "😕 1" in result - assert "❤️ 4" in result - assert "🎉 2" in result - assert "🚀 
1" in result - assert "👀 3" in result - - def test_truncate_long_replies(self): - """Test that very long reply threads are truncated.""" - bot_comment = _make_bot_comment(1) - - # Create 10 replies (exceeds MAX_REPLIES_PER_THREAD of 5) - replies = [ - _make_user_reply( - i + 10, 1, f"Reply {i}", - created_at=f"2024-01-15T{10+i:02d}:00:00Z" - ) - for i in range(10) - ] - - threads = [_make_thread(bot_comment, replies=replies)] - result = format_pr_comments_for_prompt(threads) - - # Should mention truncation - assert "earlier replies omitted" in result - # Should have the later replies (5-9) - assert "Reply 9" in result - assert "Reply 8" in result - - -class TestIsBotComment: - """Tests for is_bot_comment function.""" - - def test_with_marker_and_bot(self): - """Test bot comment true case.""" - bot_comment = {'body': f'{BOT_COMMENT_MARKER} Test**', 'user': {'type': 'Bot'}} - assert is_bot_comment(bot_comment) is True - - def test_without_marker_user_type(self): - """Test non-bot comment detection.""" - user_comment = {'body': 'Regular comment', 'user': {'type': 'User'}} - assert is_bot_comment(user_comment) is False - - def test_by_user_type(self): - """Test bot comment detection but not our bot.""" - bot_comment = {'body': 'Some automated message', 'user': {'type': 'Bot'}} - assert is_bot_comment(bot_comment) is False - - def test_with_marker(self): - """Test comment with marker but not from a bot.""" - bot_comment = {'body': f'{BOT_COMMENT_MARKER} Test**', 'user': {'type': 'User'}} - assert is_bot_comment(bot_comment) is False - - -class TestTruncateReplies: - """Tests for _truncate_replies function.""" - - def test_no_truncation_needed(self): - """Test when replies are under the limit.""" - bot_comment = _make_bot_comment(1) - replies = [_make_user_reply(i, 1, f"Reply {i}") for i in range(3)] - threads = [_make_thread(bot_comment, replies=replies)] - - result = _truncate_replies(threads) - - assert len(result[0]['replies']) == 3 - assert 'truncated_replies' not in result[0] - - def test_truncation_applied(self): - """Test when replies exceed the limit.""" - bot_comment = _make_bot_comment(1) - replies = [ - _make_user_reply(i, 1, f"Reply {i}", created_at=f"2024-01-15T{10+i:02d}:00:00Z") - for i in range(10) - ] - threads = [_make_thread(bot_comment, replies=replies)] - - result = _truncate_replies(threads) - - assert len(result[0]['replies']) == 5 - assert result[0]['truncated_replies'] == 5 - - -class TestParseBotComment: - """Tests for _parse_bot_comment function.""" - - def test_parse_full_comment(self): - """Test parsing structured data from bot comment.""" - body = f"""{BOT_COMMENT_MARKER} SQL Injection Vulnerability** - -**Severity:** HIGH -**Category:** security - -**Impact:** Attacker can execute arbitrary SQL queries - -**Recommendation:** Use parameterized queries -""" - result = _parse_bot_comment(body) - - assert result['title'] == "SQL Injection Vulnerability" - assert result['severity'] == "HIGH" - assert result['category'] == "security" - assert "arbitrary SQL" in result['impact'] - assert "parameterized" in result['recommendation'] - - -class TestFormatTimestamp: - """Tests for _format_timestamp function.""" - - def test_valid_timestamp(self): - """Test formatting valid ISO timestamp.""" - result = _format_timestamp("2024-01-15T10:30:00Z") - assert "2024-01-15" in result - assert "10:30" in result - - def test_invalid_timestamp(self): - """Test handling invalid timestamp.""" - result = _format_timestamp("invalid") - assert result == "invalid" - - -class 
TestPromptsIntegration: - """Test prompts.py integration with review context.""" - - def test_unified_prompt_with_review_context(self): - """Test that review context is included in unified prompt.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - "number": 123, - "title": "Test PR", - "body": "Test description", - "user": "testuser", - "changed_files": 1, - "additions": 10, - "deletions": 5, - "head": {"repo": {"full_name": "owner/repo"}}, - "files": [{"filename": "app.py"}], - } - - review_context = """ -═══════════════════════════════════════════════════════════════ -PREVIOUS REVIEW CONTEXT -═══════════════════════════════════════════════════════════════ -THREAD 1 - app.py:42 -Bot Finding (2024-01-15): SQL Injection -User Reply: This is a false positive -""" - - prompt = get_unified_review_prompt( - pr_data, - pr_diff="diff --git a/app.py b/app.py", - review_context=review_context - ) - - assert "PREVIOUS REVIEW CONTEXT" in prompt - assert "SQL Injection" in prompt - assert "false positive" in prompt - - def test_unified_prompt_without_review_context(self): - """Test that prompt works without review context.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - "number": 123, - "title": "Test PR", - "body": "Test description", - "user": "testuser", - "changed_files": 1, - "additions": 10, - "deletions": 5, - "head": {"repo": {"full_name": "owner/repo"}}, - "files": [{"filename": "app.py"}], - } - - prompt = get_unified_review_prompt( - pr_data, - pr_diff="diff --git a/app.py b/app.py", - review_context=None - ) - - assert "PREVIOUS REVIEW CONTEXT" not in prompt - assert "CONTEXT:" in prompt - - def test_unified_prompt_includes_pr_description(self): - """Test that PR description is included in prompt.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - "number": 123, - "title": "Test PR", - "body": "This PR adds important security fixes for the auth module.", - "user": "testuser", - "changed_files": 1, - "additions": 10, - "deletions": 5, - "head": {"repo": {"full_name": "owner/repo"}}, - "files": [{"filename": "auth.py"}], - } - - prompt = get_unified_review_prompt(pr_data, pr_diff="diff") - - assert "PR Description:" in prompt - assert "security fixes for the auth module" in prompt \ No newline at end of file diff --git a/claudecode/test_github_action_audit.py b/claudecode/test_github_action_audit.py index dc4a7c7..c7c8c70 100644 --- a/claudecode/test_github_action_audit.py +++ b/claudecode/test_github_action_audit.py @@ -1,527 +1,95 @@ #!/usr/bin/env python3 -""" -Pytest tests for GitHub Action audit script components. 
-""" +"""Pytest tests for github_action_audit module components.""" class TestImports: - """Test that all required modules can be imported.""" - def test_main_module_import(self): - """Test that the main module can be imported.""" from claudecode import github_action_audit - assert hasattr(github_action_audit, 'GitHubActionClient') - assert hasattr(github_action_audit, 'SimpleClaudeRunner') - # SimpleFindingsFilter was removed - assert hasattr(github_action_audit, 'main') - + + assert hasattr(github_action_audit, "GitHubActionClient") + assert hasattr(github_action_audit, "SimpleClaudeRunner") + assert hasattr(github_action_audit, "main") + def test_component_imports(self): - """Test that all component modules can be imported.""" - from claudecode.prompts import get_unified_review_prompt - from claudecode.json_parser import parse_json_with_fallbacks, extract_json_from_text - - # Verify they're callable/usable - assert callable(get_unified_review_prompt) + from claudecode.json_parser import extract_json_from_text, parse_json_with_fallbacks + from claudecode.prompts import build_triage_prompt + assert callable(parse_json_with_fallbacks) assert callable(extract_json_from_text) + assert callable(build_triage_prompt) class TestHardExclusionRules: - """Test the HardExclusionRules patterns.""" - def test_dos_patterns(self): - """Test DOS pattern exclusions.""" - from claudecode.findings_filter import HardExclusionRules - - dos_findings = [ - {'description': 'Potential denial of service vulnerability', 'category': 'security'}, - {'description': 'DOS attack through resource exhaustion', 'category': 'security'}, - {'description': 'Infinite loop causing resource exhaustion', 'category': 'security'}, - ] - - for finding in dos_findings: - reason = HardExclusionRules.get_exclusion_reason(finding) - assert reason is not None - assert 'dos' in reason.lower() - - def test_rate_limiting_patterns(self): - """Test rate limiting pattern exclusions.""" from claudecode.findings_filter import HardExclusionRules - - rate_limit_findings = [ - {'description': 'Missing rate limiting on endpoint', 'category': 'security'}, - {'description': 'No rate limit implemented for API', 'category': 'security'}, - {'description': 'Implement rate limiting for this route', 'category': 'security'}, - ] - - for finding in rate_limit_findings: - reason = HardExclusionRules.get_exclusion_reason(finding) - assert reason is not None - assert 'rate limit' in reason.lower() - - def test_open_redirect_patterns(self): - """Test open redirect pattern exclusions.""" - from claudecode.findings_filter import HardExclusionRules - - redirect_findings = [ - {'description': 'Open redirect vulnerability found', 'category': 'security'}, - {'description': 'Unvalidated redirect in URL parameter', 'category': 'security'}, - {'description': 'Redirect attack possible through user input', 'category': 'security'}, - ] - - for finding in redirect_findings: + + for finding in [ + {"description": "Potential denial of service vulnerability", "category": "security"}, + {"description": "DOS attack through resource exhaustion", "category": "security"}, + {"description": "Infinite loop causing resource exhaustion", "category": "security"}, + ]: reason = HardExclusionRules.get_exclusion_reason(finding) assert reason is not None - assert 'open redirect' in reason.lower() - + assert "dos" in reason.lower() + def test_markdown_file_exclusion(self): - """Test that findings in .md files are excluded.""" from claudecode.findings_filter import HardExclusionRules - - 
md_findings = [ - {'file': 'README.md', 'description': 'SQL injection vulnerability', 'category': 'security'}, - {'file': 'docs/security.md', 'description': 'Command injection found', 'category': 'security'}, - {'file': 'CHANGELOG.MD', 'description': 'XSS vulnerability', 'category': 'security'}, # Test case insensitive - {'file': 'path/to/file.Md', 'description': 'Path traversal', 'category': 'security'}, # Mixed case - ] - - for finding in md_findings: + + for finding in [ + {"file": "README.md", "description": "SQL injection vulnerability", "category": "security"}, + {"file": "docs/security.md", "description": "Command injection found", "category": "security"}, + ]: reason = HardExclusionRules.get_exclusion_reason(finding) assert reason is not None - assert 'markdown' in reason.lower() - - def test_non_markdown_files_not_excluded(self): - """Test that findings in non-.md files are not excluded due to file extension.""" - from claudecode.findings_filter import HardExclusionRules - - non_md_findings = [ - {'file': 'main.py', 'description': 'SQL injection vulnerability'}, - {'file': 'server.js', 'description': 'Command injection found'}, - {'file': 'index.html', 'description': 'XSS vulnerability'}, - {'file': 'config.yml', 'description': 'Hardcoded credentials'}, - {'file': 'README.txt', 'description': 'Path traversal'}, - {'file': 'file.mdx', 'description': 'Security issue'}, # Not .md - ] - - for finding in non_md_findings: - reason = HardExclusionRules.get_exclusion_reason(finding) - # Should not be excluded for being a markdown file - # (might be excluded for other reasons like DOS patterns) - if reason: - assert 'markdown' not in reason.lower() - - def test_keeps_real_vulnerabilities(self): - """Test that real vulnerabilities are not excluded.""" - from claudecode.findings_filter import HardExclusionRules - - real_vulns = [ - {'file': 'auth.py', 'description': 'SQL injection in user authentication', 'category': 'security'}, - {'file': 'exec.js', 'description': 'Command injection through user input', 'category': 'security'}, - {'file': 'comments.php', 'description': 'Cross-site scripting in comment field', 'category': 'security'}, - {'file': 'upload.go', 'description': 'Path traversal in file upload', 'category': 'security'}, - ] - - for finding in real_vulns: - reason = HardExclusionRules.get_exclusion_reason(finding) - assert reason is None + assert "markdown" in reason.lower() class TestJSONParser: - """Test JSON parsing utilities.""" - def test_parse_valid_json(self): - """Test parsing valid JSON.""" from claudecode.json_parser import parse_json_with_fallbacks - - valid_json = '{"test": "data", "number": 123}' - success, result = parse_json_with_fallbacks(valid_json, "test") - + + success, result = parse_json_with_fallbacks('{"test": "data", "number": 123}', "test") assert success is True assert result == {"test": "data", "number": 123} - - def test_parse_invalid_json(self): - """Test parsing invalid JSON.""" - from claudecode.json_parser import parse_json_with_fallbacks - - invalid_json = '{invalid json}' - success, result = parse_json_with_fallbacks(invalid_json, "test") - - assert success is False - assert 'error' in result - assert 'Invalid JSON response' in result['error'] - + def test_extract_json_from_text(self): - """Test extracting JSON from mixed text.""" - from claudecode.json_parser import extract_json_from_text - - mixed_text = 'Some text before {"key": "value"} some text after' - result = extract_json_from_text(mixed_text) - - assert result == {"key": "value"} - - def 
test_extract_json_from_text_no_json(self): - """Test extracting JSON when none exists.""" from claudecode.json_parser import extract_json_from_text - - plain_text = 'This is just plain text with no JSON' - result = extract_json_from_text(plain_text) - - assert result is None - - -class TestPromptsModule: - """Test the prompts module.""" - - def test_get_unified_review_prompt(self): - """Test unified review prompt generation.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Test description', - 'user': 'testuser', - 'changed_files': 1, - 'additions': 10, - 'deletions': 5, - 'head': { - 'repo': { - 'full_name': 'owner/repo' - } - }, - 'files': [ - { - 'filename': 'test.py', - 'status': 'modified', - 'additions': 10, - 'deletions': 5, - 'patch': '@@ -1,5 +1,10 @@\n+added line' - } - ] - } - - pr_diff = "diff --git a/test.py b/test.py\n+added line" - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert isinstance(prompt, str) - assert 'security' in prompt.lower() - assert 'PR #123' in prompt - assert 'test.py' in prompt - -class TestDeploymentPRDetection: - """Test deployment PR title pattern matching.""" - - def test_deployment_pr_patterns(self): - """Test that deployment PR titles are correctly identified.""" - import re - - deployment_pattern = r'^Deploy\s+[a-f0-9]{6,}\s+to\s+(production|staging|development|production-services)' - - # These should match - deployment_titles = [ - "Deploy 53f395b0 to production-services", - "Deploy af179b5b to production", - "Deploy 1a3cb909 to production", - "Deploy 49c09ea5 to production-services", - "Deploy 8e7acc60 to production", - "Deploy e0b1fe0b to production-services", - "Deploy c53e6010 to production", - "Deploy 42c4a061 to production", - "Deploy 9de55976 to production-services", - "deploy abcdef123456 to staging", # lowercase should work - "DEPLOY ABCDEF01 TO DEVELOPMENT", # uppercase should work - ] - - for title in deployment_titles: - assert re.match(deployment_pattern, title, re.IGNORECASE), f"Failed to match deployment PR: {title}" - - def test_non_deployment_pr_patterns(self): - """Test that non-deployment PR titles are not matched.""" - import re - - deployment_pattern = r'^Deploy\s+[a-f0-9]{6,}\s+to\s+(production|staging|development|production-services)' - - # These should NOT match - non_deployment_titles = [ - "Add new feature", - "Fix bug in deployment script", - "Update deployment documentation", - "Deploy new feature to production", # No commit hash - "Deploy abc to production", # Too short hash - "Deploy 12345g to production", # Non-hex character - "Preparing deploy af179b5b to production", # Doesn't start with Deploy - "Deploy af179b5b to testing", # Wrong environment - "Deploy af179b5b", # Missing environment - "af179b5b to production", # Missing Deploy prefix - ] - - for title in non_deployment_titles: - assert not re.match(deployment_pattern, title, re.IGNORECASE), f"Incorrectly matched non-deployment PR: {title}" + assert extract_json_from_text('Some text before {"key": "value"} some text after') == {"key": "value"} class TestBuiltinExclusions: - """Test built-in file and directory exclusions.""" - def test_builtin_excluded_directories(self): - """Test that built-in directories are in the exclusion list.""" - from claudecode.github_action_audit import GitHubActionClient - - expected_dirs = [ - 'node_modules', - 'vendor', - 'dist', - 'build', - '.next', - '__pycache__', - '.gradle', - 'Pods', - 'DerivedData', - ] - - for dir_name in 
expected_dirs: - assert dir_name in GitHubActionClient.BUILTIN_EXCLUDED_DIRS, f"Missing built-in excluded dir: {dir_name}" - - def test_builtin_excluded_patterns(self): - """Test that built-in file patterns are in the exclusion list.""" from claudecode.github_action_audit import GitHubActionClient - expected_patterns = [ - 'package-lock.json', - 'yarn.lock', - '*.min.js', - '*.min.css', - '*.pb.go', - '*.generated.*', - '*.png', - '*.jpg', - '*.woff2', - '*.pyc', - ] - - for pattern in expected_patterns: - assert pattern in GitHubActionClient.BUILTIN_EXCLUDED_PATTERNS, f"Missing built-in excluded pattern: {pattern}" - - def test_is_excluded_lock_files(self): - """Test that lock files are excluded.""" - from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - lock_files = [ - 'package-lock.json', - 'yarn.lock', - 'Gemfile.lock', - 'poetry.lock', - 'Cargo.lock', - 'go.sum', - 'nested/path/package-lock.json', - ] - - for filepath in lock_files: - assert client._is_excluded(filepath), f"Lock file should be excluded: {filepath}" + for dir_name in ["node_modules", "vendor", "dist", "build", ".next", "__pycache__", ".gradle", "Pods", "DerivedData"]: + assert dir_name in GitHubActionClient.BUILTIN_EXCLUDED_DIRS - def test_is_excluded_generated_files(self): - """Test that generated files are excluded.""" - from claudecode.github_action_audit import GitHubActionClient + def test_is_excluded_source_files(self): from unittest.mock import patch - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - generated_files = [ - 'app.min.js', - 'styles.min.css', - 'app.bundle.js', - 'main.chunk.js', - 'api.pb.go', - 'models.generated.ts', - 'user.g.dart', - ] - - for filepath in generated_files: - assert client._is_excluded(filepath), f"Generated file should be excluded: {filepath}" - - def test_is_excluded_binary_files(self): - """Test that binary files are excluded.""" from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): + with patch.dict("os.environ", {"GITHUB_TOKEN": "test-token", "EXCLUDE_DIRECTORIES": ""}): client = GitHubActionClient() - binary_files = [ - 'logo.png', - 'photo.jpg', - 'icon.ico', - 'font.woff2', - 'document.pdf', - 'archive.zip', - ] - - for filepath in binary_files: - assert client._is_excluded(filepath), f"Binary file should be excluded: {filepath}" + assert not client.is_excluded("src/main.py") + assert not client.is_excluded("tests/test_auth.py") - def test_is_excluded_vendor_directories(self): - """Test that vendor directories are excluded.""" - from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - vendor_paths = [ - 'node_modules/lodash/index.js', - 'vendor/github.com/pkg/errors/errors.go', - 'dist/bundle.js', - 'build/output.js', - '.next/cache/data.json', - '__pycache__/module.pyc', - 'Pods/AFNetworking/Source.m', - ] - - for filepath in vendor_paths: - assert client._is_excluded(filepath), f"Vendor path should be excluded: {filepath}" - - def test_is_not_excluded_source_files(self): - """Test that regular source files are NOT excluded.""" - 
from claudecode.github_action_audit import GitHubActionClient + def test_user_exclusions_combined_with_builtin(self): from unittest.mock import patch - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - source_files = [ - 'src/main.py', - 'lib/utils.js', - 'app/models/user.rb', - 'pkg/handler/api.go', - 'src/components/Button.tsx', - 'tests/test_auth.py', - ] - - for filepath in source_files: - assert not client._is_excluded(filepath), f"Source file should NOT be excluded: {filepath}" - - def test_user_exclusions_combined_with_builtin(self): - """Test that user exclusions are combined with built-in exclusions.""" from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': 'custom_dir,my_vendor'}): + with patch.dict("os.environ", {"GITHUB_TOKEN": "test-token", "EXCLUDE_DIRECTORIES": "custom_dir,my_vendor"}): client = GitHubActionClient() - # Built-in should still work - assert client._is_excluded('node_modules/pkg/index.js') - assert client._is_excluded('vendor/lib/code.go') - - # User exclusions should also work - assert client._is_excluded('custom_dir/file.py') - assert client._is_excluded('my_vendor/lib.js') - - -class TestExtractReviewFindings: - """Test the _extract_review_findings method.""" - - def test_extract_findings_with_pr_summary(self): - """Test that pr_summary is extracted from Claude output.""" - import json - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - claude_output = { - 'result': json.dumps({ - 'findings': [ - {'file': 'test.py', 'line': 1, 'severity': 'HIGH', 'description': 'Test finding'} - ], - 'pr_summary': { - 'overview': 'This PR adds authentication middleware.', - 'file_changes': [ - {'label': 'src/auth.py', 'files': ['src/auth.py'], 'changes': 'Added JWT validation'} - ] - } - }) - } - - result = runner._extract_review_findings(claude_output) - - assert 'pr_summary' in result - assert result['pr_summary']['overview'] == 'This PR adds authentication middleware.' 
- assert len(result['pr_summary']['file_changes']) == 1 - assert result['pr_summary']['file_changes'][0]['files'] == ['src/auth.py'] - assert len(result['findings']) == 1 - - def test_extract_findings_without_pr_summary(self): - """Test that missing pr_summary defaults to empty dict.""" - import json - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - claude_output = { - 'result': json.dumps({ - 'findings': [] - }) - } - - result = runner._extract_review_findings(claude_output) - - assert 'pr_summary' in result - assert result['pr_summary'] == {} - - def test_extract_findings_empty_result(self): - """Test extraction with invalid/empty Claude output.""" - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - result = runner._extract_review_findings({}) - - assert 'pr_summary' in result - assert result['pr_summary'] == {} - assert result['findings'] == [] - - def test_extract_findings_with_grouped_files(self): - """Test that grouped file_changes are handled correctly.""" - import json - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - claude_output = { - 'result': json.dumps({ - 'findings': [], - 'pr_summary': { - 'overview': 'Test updates.', - 'file_changes': [ - { - 'label': 'tests/test_*.py', - 'files': ['tests/test_auth.py', 'tests/test_login.py', 'tests/test_session.py'], - 'changes': 'Unit tests for auth module' - } - ] - } - }) - } - - result = runner._extract_review_findings(claude_output) - - assert 'pr_summary' in result - file_changes = result['pr_summary']['file_changes'] - assert len(file_changes) == 1 - assert file_changes[0]['label'] == 'tests/test_*.py' - assert len(file_changes[0]['files']) == 3 + assert client.is_excluded("node_modules/pkg/index.js") + assert client.is_excluded("custom_dir/file.py") class TestDiffSizeLimits: - """Test diff size limit functionality.""" - def test_diff_line_counting(self): - """Test that diff lines are counted correctly.""" sample_diff = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -534,34 +102,4 @@ def test_diff_line_counting(self): +replaced line 5 line 6""" - line_count = len(sample_diff.splitlines()) - assert line_count == 11 # Count the actual lines - - def test_max_diff_lines_env_parsing(self): - """Test that MAX_DIFF_LINES environment variable is parsed correctly.""" - import os - from unittest.mock import patch - - # Test default value when env var is not set - with patch.dict('os.environ', {}, clear=False): - os.environ.pop('MAX_DIFF_LINES', None) - max_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') - try: - max_lines = int(max_lines_str) - except ValueError: - max_lines = 5000 - - assert max_lines == 5000 # Default when not set - - def test_max_diff_lines_zero_forces_agentic_mode(self): - """Test that setting MAX_DIFF_LINES to 0 forces agentic file reading mode.""" - import os - from unittest.mock import patch - - with patch.dict('os.environ', {'MAX_DIFF_LINES': '0'}): - max_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') - max_lines = int(max_lines_str) - - # When max_lines is 0, agentic mode is always used - assert max_lines == 0 - # In the actual code: use_agentic_mode = max_diff_lines == 0 or ... 
+ assert len(sample_diff.splitlines()) == 11 diff --git a/claudecode/test_helper_functions.py b/claudecode/test_helper_functions.py index ad60728..e124ec8 100644 --- a/claudecode/test_helper_functions.py +++ b/claudecode/test_helper_functions.py @@ -9,10 +9,8 @@ get_environment_config, initialize_clients, initialize_findings_filter, - run_code_review, apply_findings_filter, ConfigurationError, - AuditError ) from claudecode.findings_filter import FindingsFilter @@ -145,34 +143,6 @@ def test_initialize_findings_filter_with_defaults(self, mock_simple_filter): assert result == mock_filter_instance - def test_run_code_review_success(self): - """Test successful code review execution.""" - mock_runner = MagicMock() - mock_runner.run_code_review.return_value = ( - True, - "", - {"findings": [{"id": 1}], "analysis_summary": {}} - ) - - result = run_code_review(mock_runner, "test prompt") - - assert result == {"findings": [{"id": 1}], "analysis_summary": {}} - mock_runner.run_code_review.assert_called_once() - - def test_run_code_review_failure(self): - """Test code review execution failure.""" - mock_runner = MagicMock() - mock_runner.run_code_review.return_value = ( - False, - "Audit failed: timeout", - {} - ) - - with pytest.raises(AuditError) as exc_info: - run_code_review(mock_runner, "test prompt") - - assert "Code review failed: Audit failed: timeout" in str(exc_info.value) - def test_apply_findings_filter_with_findings_filter(self): """Test applying FindingsFilter to findings.""" mock_filter = MagicMock(spec=FindingsFilter) diff --git a/claudecode/test_main_function.py b/claudecode/test_main_function.py index 0b2bdae..7c46ec0 100644 --- a/claudecode/test_main_function.py +++ b/claudecode/test_main_function.py @@ -1,191 +1,50 @@ #!/usr/bin/env python3 -""" -Unit tests for main function and full workflow. 
-""" +"""Unit tests for main function and multi-phase workflow.""" -import pytest import json import os -from unittest.mock import Mock, patch from pathlib import Path +from unittest.mock import Mock, patch + +import pytest from claudecode.github_action_audit import main -class TestMainFunction: - """Test main function execution flow.""" +def _mock_multiphase_success(mock_runner, findings=None): + findings = findings or [] + mock_runner.run_prompt.side_effect = [ + (True, "", {"skip_review": False, "reason": "", "risk_level": "medium"}), + (True, "", {"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), + (True, "", {"findings": findings}), + (True, "", {"findings": []}), + (True, "", {"findings": []}), + ( + True, + "", + { + "validated_findings": [ + {"finding_index": idx, "keep": True, "confidence": 0.95, "reason": "valid"} + for idx in range(len(findings)) + ] + }, + ), + ] + +class TestMainFunction: def test_main_missing_environment_vars(self, capsys): - """Test main with missing environment variables.""" with patch.dict(os.environ, {}, clear=True): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'error' in output - assert 'GITHUB_REPOSITORY' in output['error'] - - def test_main_missing_pr_number(self, capsys): - """Test main with missing PR number.""" - with patch.dict(os.environ, {'GITHUB_REPOSITORY': 'owner/repo'}, clear=True): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'PR_NUMBER' in output['error'] - - def test_main_invalid_pr_number(self, capsys): - """Test main with invalid PR number.""" - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': 'not-a-number' - }, clear=True): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Invalid PR_NUMBER' in output['error'] - - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_github_client_init_failure(self, mock_client_class, capsys): - """Test main when GitHub client initialization fails.""" - mock_client_class.side_effect = Exception("Token invalid") - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'invalid' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Failed to initialize GitHub client' in output['error'] - assert 'Token invalid' in output['error'] - - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_claude_runner_init_failure(self, mock_client_class, mock_runner_class, capsys): - """Test main when Claude runner initialization fails.""" - mock_client_class.return_value = Mock() - mock_runner_class.side_effect = Exception("Runner error") - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = 
capsys.readouterr() - output = json.loads(captured.out) - assert 'Failed to initialize Claude runner' in output['error'] - - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_filter_initialization(self, mock_client_class, mock_runner_class, - mock_full_filter_class): - """Test filter initialization logic.""" - # Setup mocks - mock_client = Mock() - mock_client_class.return_value = mock_client - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (False, "Not available") - mock_runner_class.return_value = mock_runner - - # Test with full filtering enabled - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_CLAUDE_FILTERING': 'true', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ANTHROPIC_API_KEY': 'test-api-key' - }): - with pytest.raises(SystemExit): - main() - - # Should initialize full filter - mock_full_filter_class.assert_called_once() - call_kwargs = mock_full_filter_class.call_args[1] - assert call_kwargs['use_hard_exclusions'] is True - assert call_kwargs['use_claude_filtering'] is True - assert call_kwargs['api_key'] == 'test-api-key' - - # Reset mocks - mock_full_filter_class.reset_mock() - - # Test with filtering disabled - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ENABLE_CLAUDE_FILTERING': 'false' - }): - with pytest.raises(SystemExit): - main() - - # Should use FindingsFilter with hard exclusions only - mock_full_filter_class.assert_called_once() - call_kwargs = mock_full_filter_class.call_args[1] - assert call_kwargs['use_hard_exclusions'] is True - assert call_kwargs['use_claude_filtering'] is False - - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_claude_not_available(self, mock_client_class, mock_runner_class, - mock_filter_class, capsys): - """Test when Claude is not available.""" - mock_client = Mock() - mock_client_class.return_value = mock_client - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (False, "Claude not installed") - mock_runner_class.return_value = mock_runner - - mock_filter_class.return_value = Mock() - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 # Claude not available, exit 1 - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Claude Code not available' in output['error'] - assert 'Claude not installed' in output['error'] + assert exc_info.value.code == 2 + output = json.loads(capsys.readouterr().out) + assert "GITHUB_REPOSITORY" in output["error"] - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_pr_data_fetch_failure(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, capsys): - """Test when PR data fetch fails.""" + 
@patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_main_pr_data_fetch_failure(self, mock_client_class, mock_runner_class, capsys): mock_client = Mock() mock_client.get_pr_data.side_effect = Exception("API error") mock_client_class.return_value = mock_client @@ -194,490 +53,112 @@ def test_main_pr_data_fetch_failure(self, mock_client_class, mock_runner_class, mock_runner.validate_claude_available.return_value = (True, "") mock_runner_class.return_value = mock_runner - mock_filter_class.return_value = Mock() - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): + with patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 1 # API error, exit 1 - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Failed to fetch PR data' in output['error'] - assert 'API error' in output['error'] + assert exc_info.value.code == 1 + output = json.loads(capsys.readouterr().out) + assert "Failed to fetch PR data" in output["error"] - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_successful_audit_no_findings(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test successful audit with no findings.""" - # Setup mocks + @patch("pathlib.Path.cwd") + @patch("claudecode.github_action_audit.FindingsFilter") + @patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_main_successful_audit_no_findings(self, mock_client_class, mock_runner_class, mock_filter_class, mock_cwd, capsys): mock_client = Mock() - mock_client.get_pr_data.return_value = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Description' - } + mock_client.get_pr_data.return_value = {"number": 123, "title": "Test PR", "body": "Description"} mock_client.get_pr_diff.return_value = "diff content" + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - # Unified review - single call - mock_runner.run_code_review.return_value = ( - True, - "", - { - 'findings': [], - 'analysis_summary': { - 'files_reviewed': 5, - 'high_severity': 0, - 'medium_severity': 0, - 'low_severity': 0 - } - } - ) + _mock_multiphase_success(mock_runner, findings=[]) mock_runner_class.return_value = mock_runner mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': [], - 'excluded_findings': [], - 'analysis_summary': {} - }, - Mock() # filter_stats - ) + mock_filter.filter_findings.return_value = (True, {"filtered_findings": [], "excluded_findings": [], "analysis_summary": {}}, Mock()) mock_filter_class.return_value = mock_filter - mock_prompt_func.return_value = "unified review prompt" - mock_cwd.return_value = Path('/tmp/repo') + mock_cwd.return_value = Path("/tmp/repo") - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): + with 
patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 0 # No findings, exit 0 - - captured = capsys.readouterr() - output = json.loads(captured.out) - - assert output['pr_number'] == 123 - assert output['repo'] == 'owner/repo' - assert len(output['findings']) == 0 - assert output['filtering_summary']['total_original_findings'] == 0 - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_successful_audit_with_findings(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test successful audit with high severity findings.""" - # Setup mocks + assert exc_info.value.code == 0 + output = json.loads(capsys.readouterr().out) + assert output["pr_number"] == 123 + assert output["repo"] == "owner/repo" + assert output["findings"] == [] + + @patch("pathlib.Path.cwd") + @patch("claudecode.github_action_audit.FindingsFilter") + @patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_main_successful_audit_with_high_finding(self, mock_client_class, mock_runner_class, mock_filter_class, mock_cwd, capsys): mock_client = Mock() - mock_client.get_pr_data.return_value = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Description' - } + mock_client.get_pr_data.return_value = {"number": 123, "title": "Test PR", "body": "Description"} mock_client.get_pr_diff.return_value = "diff content" - mock_client._is_excluded.return_value = False # Don't exclude any files in tests + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [ { - 'file': 'test.py', - 'line': 10, - 'severity': 'HIGH', - 'category': 'security', - 'description': 'SQL injection' - }, - { - 'file': 'main.py', - 'line': 20, - 'severity': 'MEDIUM', - 'category': 'correctness', - 'description': 'Logic error' + "file": "test.py", + "line": 10, + "severity": "HIGH", + "category": "security", + "description": "SQL injection", } ] mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - # Unified review - single call with all findings - mock_runner.run_code_review.return_value = ( - True, - "", - { - 'findings': findings, - 'analysis_summary': { - 'files_reviewed': 2, - 'high_severity': 1, - 'medium_severity': 1, - 'low_severity': 0 - } - } - ) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner - # Filter keeps only high severity mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': [findings[0]], - 'excluded_findings': [findings[1]], - 'analysis_summary': { - 'total_findings': 2, - 'kept_findings': 1, - 'excluded_findings': 1 - } - }, - Mock() # filter_stats - ) + mock_filter.filter_findings.return_value = (True, {"filtered_findings": findings, "excluded_findings": [], "analysis_summary": {}}, Mock()) mock_filter_class.return_value = mock_filter - mock_prompt_func.return_value = "unified review prompt" - mock_cwd.return_value = Path('/tmp/repo') + mock_cwd.return_value = Path("/tmp/repo") - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': 
'123', - 'GITHUB_TOKEN': 'test-token' - }): + with patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 1 # High severity finding, exit 1 - - captured = capsys.readouterr() - output = json.loads(captured.out) - - assert len(output['findings']) == 1 - assert output['findings'][0]['severity'] == 'HIGH' - assert output['filtering_summary']['total_original_findings'] == 2 - assert output['filtering_summary']['excluded_findings'] == 1 - assert output['filtering_summary']['kept_findings'] == 1 - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_with_full_filter(self, mock_client_class, mock_runner_class, - mock_prompt_func, mock_cwd, capsys): - """Test main with full FindingsFilter (LLM-based).""" - # Setup mocks - mock_client = Mock() - mock_client.get_pr_data.return_value = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Description' - } - mock_client.get_pr_diff.return_value = "diff content" - mock_client._is_excluded.return_value = False # Don't exclude any files in tests - mock_client_class.return_value = mock_client - - findings = [{'file': 'test.py', 'line': 10, 'severity': 'HIGH', 'description': 'Issue'}] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) - mock_runner_class.return_value = mock_runner - - mock_prompt_func.return_value = "prompt" - mock_cwd.return_value = Path('/tmp') - - # Mock FindingsFilter to return findings properly - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': findings, - 'excluded_findings': [], - 'analysis_summary': { - 'total_findings': 1, - 'kept_findings': 1, - 'excluded_findings': 0 - } - }, - Mock() # filter_stats - ) - - with patch('claudecode.github_action_audit.FindingsFilter', return_value=mock_filter): - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ENABLE_CLAUDE_FILTERING': 'false', # Use simple filter - }): - with pytest.raises(SystemExit): - main() - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # Check that we got successful results - assert 'findings' in output - assert len(output['findings']) == 1 - assert output['findings'][0]['severity'] == 'HIGH' - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_filter_failure_keeps_all_findings(self, mock_client_class, mock_runner_class, - mock_prompt_func, mock_cwd, capsys): - """Test that filter failure keeps all findings with SimpleFindingsFilter.""" - # Setup mocks - mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} - mock_client.get_pr_diff.return_value = "diff" - mock_client._is_excluded.return_value = False # Don't exclude any files in tests - mock_client_class.return_value = mock_client - - findings = [ - {'file': 'a.py', 'line': 1, 'severity': 'HIGH', 'description': 'Issue 1'}, - {'file': 'b.py', 'line': 2, 
'severity': 'HIGH', 'description': 'Issue 2'} - ] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) - mock_runner_class.return_value = mock_runner - - mock_prompt_func.return_value = "prompt" - mock_cwd.return_value = Path('/tmp') - - # Mock FindingsFilter to keep all findings - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': findings, - 'excluded_findings': [], - 'analysis_summary': { - 'total_findings': 2, - 'kept_findings': 2, - 'excluded_findings': 0 - } - }, - Mock() # filter_stats - ) - - with patch('claudecode.github_action_audit.FindingsFilter', return_value=mock_filter): - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ENABLE_CLAUDE_FILTERING': 'false', # Use simple filter - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 # Has HIGH findings - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # All findings should be kept - assert len(output['findings']) == 2 - - def test_main_unexpected_error(self, capsys): - """Test unexpected error handling.""" - with patch('claudecode.github_action_audit.GitHubActionClient') as mock_class: - mock_class.side_effect = Exception("Unexpected error!") - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Unexpected error' in output['error'] + assert exc_info.value.code == 1 + output = json.loads(capsys.readouterr().out) + assert len(output["findings"]) == 1 + assert output["findings"][0]["review_type"] == "security" class TestAuditFailureModes: - """Test various audit failure scenarios.""" - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_audit_failure(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test when code review fails.""" + @patch("pathlib.Path.cwd") + @patch("claudecode.github_action_audit.FindingsFilter") + @patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_audit_failure(self, mock_client_class, mock_runner_class, mock_filter_class, mock_cwd, capsys): mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123} + mock_client.get_pr_data.return_value = {"number": 123} mock_client.get_pr_diff.return_value = "diff" mock_client_class.return_value = mock_client mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = ( - False, - "Claude execution failed", - {} - ) + mock_runner.run_prompt.return_value = (False, "Claude execution failed", {}) mock_runner_class.return_value = mock_runner mock_filter_class.return_value = Mock() - mock_prompt_func.return_value = "prompt" - mock_cwd.return_value = Path('/tmp') - - with patch.dict(os.environ, { - 
'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() + mock_cwd.return_value = Path("/tmp") - assert exc_info.value.code == 1 # Audit failure, exit 1 - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Code review failed' in output['error'] - assert 'Claude execution failed' in output['error'] - - -class TestReviewTypeMetadata: - """Test that findings are tagged with correct review_type based on category.""" - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_findings_have_correct_review_type(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test that findings get review_type based on category.""" - mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} - mock_client.get_pr_diff.return_value = "diff" - mock_client._is_excluded.return_value = False - mock_client_class.return_value = mock_client - - # Mixed findings - security and non-security - findings = [ - {'file': 'test.py', 'line': 1, 'severity': 'MEDIUM', 'category': 'correctness', 'description': 'Logic issue'}, - {'file': 'auth.py', 'line': 5, 'severity': 'HIGH', 'category': 'security', 'description': 'Security issue'} - ] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) - mock_runner_class.return_value = mock_runner - - # Filter passes through all findings - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - {'filtered_findings': findings, 'excluded_findings': [], 'analysis_summary': {}}, - Mock() - ) - mock_filter_class.return_value = mock_filter - - mock_prompt_func.return_value = "unified prompt" - mock_cwd.return_value = Path('/tmp') - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): + with patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 1 # Has HIGH finding - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # Check that we have findings with correct review_type based on category - assert len(output['findings']) == 2 - - # Find the general finding (correctness category) and verify review_type - general_finding = next(f for f in output['findings'] if f['category'] == 'correctness') - assert general_finding['review_type'] == 'general' - - # Find the security finding and verify review_type - security_finding = next(f for f in output['findings'] if f['category'] == 'security') - assert security_finding['review_type'] == 'security' - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_review_type_not_overwritten_if_already_set(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test 
that review_type is not overwritten if finding already has it.""" - mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} - mock_client.get_pr_diff.return_value = "diff" - mock_client._is_excluded.return_value = False - mock_client_class.return_value = mock_client - - # Finding already has review_type set (e.g., from a custom analyzer) - findings = [ - {'file': 'test.py', 'line': 1, 'severity': 'MEDIUM', 'category': 'correctness', - 'description': 'Issue', 'review_type': 'custom'} - ] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) - mock_runner_class.return_value = mock_runner - - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - {'filtered_findings': findings, 'excluded_findings': [], 'analysis_summary': {}}, - Mock() - ) - mock_filter_class.return_value = mock_filter - - mock_prompt_func.return_value = "unified prompt" - mock_cwd.return_value = Path('/tmp') - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit): - main() - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # The existing review_type should be preserved (setdefault doesn't overwrite) - assert output['findings'][0]['review_type'] == 'custom' + assert exc_info.value.code == 1 + output = json.loads(capsys.readouterr().out) + assert "Code review failed" in output["error"] + assert "Claude execution failed" in output["error"] diff --git a/claudecode/test_prompts.py b/claudecode/test_prompts.py index dbb9d4f..6a1404b 100644 --- a/claudecode/test_prompts.py +++ b/claudecode/test_prompts.py @@ -1,6 +1,14 @@ -"""Unit tests for the prompts module.""" +"""Unit tests for multi-phase prompt generation.""" -from claudecode.prompts import get_unified_review_prompt +from claudecode.prompts import ( + _build_hybrid_diff_section, + build_compliance_prompt, + build_context_discovery_prompt, + build_quality_prompt, + build_security_prompt, + build_triage_prompt, + build_validation_prompt, +) def _sample_pr_data(): @@ -12,184 +20,64 @@ def _sample_pr_data(): "changed_files": 1, "additions": 10, "deletions": 5, - "head": { - "repo": { - "full_name": "owner/repo" - } - }, - "files": [ - { - "filename": "app.py", - "status": "modified", - "additions": 10, - "deletions": 5, - } - ], + "head": {"repo": {"full_name": "owner/repo"}}, + "files": [{"filename": "app.py"}], } -class TestPrompts: - """Test unified prompt generation.""" - - def test_get_unified_review_prompt_basic(self): - pr_data = _sample_pr_data() - - pr_diff = """ -diff --git a/app.py b/app.py -@@ -1,5 +1,10 @@ - def process_input(user_input): -- return user_input -+ # Process the input -+ result = eval(user_input) -+ return result -""" - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert isinstance(prompt, str) - assert len(prompt) > 0 - assert "123" in prompt - assert "Add new feature" in prompt - assert "testuser" in prompt - assert "app.py" in prompt - assert "eval(user_input)" in prompt - assert "code quality" in prompt.lower() - assert "security" in prompt.lower() - - def test_get_unified_review_prompt_without_diff_uses_file_reading_instructions(self): - pr_data = _sample_pr_data() - - prompt = get_unified_review_prompt(pr_data, pr_diff="diff --git a/a b/a", include_diff=False) - - assert "PR DIFF CONTENT:" not in prompt - assert 
"IMPORTANT - FILE READING INSTRUCTIONS:" in prompt - - def test_get_unified_review_prompt_no_files(self): - pr_data = _sample_pr_data() - pr_data["changed_files"] = 0 - pr_data["files"] = [] - - prompt = get_unified_review_prompt(pr_data, pr_diff="") - - assert isinstance(prompt, str) - assert "Files changed: 0" in prompt - - def test_get_unified_review_prompt_structure(self): - pr_data = _sample_pr_data() - pr_data["title"] = "Test PR" - - pr_diff = "diff --git a/test.py b/test.py\n+print('test')" - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert "CONTEXT:" in prompt - assert "OBJECTIVE:" in prompt - assert "REQUIRED OUTPUT FORMAT:" in prompt - assert pr_diff in prompt - - def test_get_unified_review_prompt_long_diff(self): - pr_data = { - "number": 12345, - "title": "Major refactoring", - "body": "Refactoring the entire codebase", - "user": "refactor-bot", - "changed_files": 10, - "additions": 1000, - "deletions": 500, - "head": { - "repo": { - "full_name": "owner/repo" - } - }, - "files": [ - { - "filename": f"file{i}.py", - "status": "modified", - "additions": 100, - "deletions": 50, - } - for i in range(10) - ], - } - - pr_diff = "\n".join([ - f"diff --git a/file{i}.py b/file{i}.py\n" + - "\n".join([f"+line {j}" for j in range(50)]) - for i in range(10) - ]) - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert isinstance(prompt, str) - assert len(prompt) > 1000 - assert "12345" in prompt - assert "Major refactoring" in prompt - - def test_get_unified_review_prompt_unicode(self): - pr_data = { - "number": 666, - "title": "Add emoji support", - "body": "This PR adds emoji rendering", - "user": "emoji-user", - "changed_files": 1, - "additions": 42, - "deletions": 0, - "head": { - "repo": { - "full_name": "owner/repo" - } - }, - "files": [ - { - "filename": "emoji.py", - "status": "added", - "additions": 42, - "deletions": 0, - } - ], - } - - pr_diff = """ -diff --git a/emoji.py b/emoji.py -+# Security check -+def check_input(text: str) -> bool: -+ return "ALERT" not in text -""" - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert "emoji-user" in prompt - assert "emoji.py" in prompt - assert "ALERT" in prompt - - def test_get_unified_review_prompt_custom_instructions(self): - pr_data = _sample_pr_data() - - prompt = get_unified_review_prompt( - pr_data, - pr_diff="diff --git a/app.py b/app.py", - custom_review_instructions="Check transaction consistency.", - custom_security_instructions="Check GraphQL authz.", - ) - - assert "Check transaction consistency." in prompt - assert "Check GraphQL authz." 
in prompt - - def test_get_unified_review_prompt_includes_pr_summary_schema(self): - """Test that prompt includes pr_summary in JSON schema.""" - pr_data = _sample_pr_data() - prompt = get_unified_review_prompt(pr_data, pr_diff="diff") - - assert '"pr_summary"' in prompt - assert '"overview"' in prompt - assert '"file_changes"' in prompt - assert '"label"' in prompt - assert '"files"' in prompt - - def test_get_unified_review_prompt_includes_summary_guidelines(self): - """Test that prompt includes PR summary guidelines.""" - pr_data = _sample_pr_data() - prompt = get_unified_review_prompt(pr_data, pr_diff="diff") - - assert "PR SUMMARY GUIDELINES:" in prompt - assert "2-4 sentences" in prompt - assert "~10 words" in prompt \ No newline at end of file +def test_build_hybrid_diff_section_max_lines_zero_omits_inline_diff(): + section = _build_hybrid_diff_section("diff --git a/a.py b/a.py\n+print('x')", 0) + + assert "intentionally omitted" in section + assert "```diff" not in section + + +def test_build_hybrid_diff_section_truncates_when_over_limit(): + diff = "\n".join([f"+line {i}" for i in range(20)]) + + section = _build_hybrid_diff_section(diff, 5) + + assert "TRUNCATED" in section + assert "5 lines out of 20" in section + + +def test_triage_prompt_contains_required_schema(): + prompt = build_triage_prompt(_sample_pr_data(), "diff --git a/app.py b/app.py", 100) + + assert '"skip_review"' in prompt + assert '"risk_level"' in prompt + + +def test_context_prompt_contains_discovery_fields(): + prompt = build_context_discovery_prompt(_sample_pr_data(), "diff --git a/app.py b/app.py", 100) + + assert '"claude_md_files"' in prompt + assert '"priority_files"' in prompt + + +def test_specialist_prompts_include_findings_schema_and_custom_instructions(): + pr_data = _sample_pr_data() + context = {"hotspots": ["app.py"]} + + compliance = build_compliance_prompt(pr_data, "diff", 100, context) + quality = build_quality_prompt(pr_data, "diff", 100, context, custom_review_instructions="Check tx safety") + security = build_security_prompt(pr_data, "diff", 100, context, custom_security_instructions="Check SSRF") + + for prompt in [compliance, quality, security]: + assert '"findings"' in prompt + assert '"confidence"' in prompt + + assert "rule_reference" in compliance + assert "Check tx safety" in quality + assert "exploit_preconditions" in security + assert "Check SSRF" in security + + +def test_validation_prompt_contains_candidate_findings(): + findings = [{"file": "app.py", "line": 10, "severity": "HIGH"}] + + prompt = build_validation_prompt(_sample_pr_data(), "diff", 100, findings) + + assert "CANDIDATE FINDINGS" in prompt + assert '"validated_findings"' in prompt + assert '"finding_index"' in prompt diff --git a/claudecode/test_review_orchestrator.py b/claudecode/test_review_orchestrator.py new file mode 100644 index 0000000..8dec762 --- /dev/null +++ b/claudecode/test_review_orchestrator.py @@ -0,0 +1,44 @@ +"""Unit tests for review_orchestrator.""" + +from pathlib import Path +from unittest.mock import Mock + +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator + + +def test_review_model_config_from_env_with_fallbacks(): + env = { + "CLAUDE_MODEL": "global-model", + "MODEL_TRIAGE": "triage-model", + "MODEL_SECURITY": "security-model", + } + cfg = ReviewModelConfig.from_env(env, "default-model") + assert cfg.triage == "triage-model" + assert cfg.compliance == "global-model" + assert cfg.quality == "global-model" + assert cfg.security == "security-model" + assert 
cfg.validation == "global-model" + + +def test_orchestrator_invalid_triage_schema_fails(): + runner = Mock() + runner.run_prompt.return_value = ( + True, + "", + {"findings": [{"file": "a.py", "line": 3, "severity": "HIGH", "category": "security", "title": "Issue"}]}, + ) + findings_filter = Mock() + github_client = Mock() + github_client.is_excluded.return_value = False + cfg = ReviewModelConfig("t", "c", "q", "s", "v") + + orchestrator = ReviewOrchestrator(runner, findings_filter, github_client, cfg, 100) + ok, result, err = orchestrator.run( + repo_dir=Path("."), + pr_data={"number": 1, "changed_files": 1, "head": {"repo": {"full_name": "x/y"}}}, + pr_diff="diff --git a/a.py b/a.py", + ) + + assert ok is False + assert result == {} + assert "invalid schema" in err.lower() diff --git a/claudecode/test_workflow_integration.py b/claudecode/test_workflow_integration.py index e5410d6..fa43290 100644 --- a/claudecode/test_workflow_integration.py +++ b/claudecode/test_workflow_integration.py @@ -13,6 +13,31 @@ from claudecode.github_action_audit import main +def _mock_claude_result(payload): + """Build a mocked subprocess result containing JSON payload.""" + return Mock(returncode=0, stdout=json.dumps(payload), stderr='') + + +def _multiphase_run_side_effect(findings): + """Create subprocess.run side effects for strict multi-phase flow.""" + return [ + Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), # claude --version + _mock_claude_result({"skip_review": False, "reason": "", "risk_level": "medium"}), # triage + _mock_claude_result({"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), # context + _mock_claude_result({"findings": findings}), # compliance + _mock_claude_result({"findings": []}), # quality + _mock_claude_result({"findings": []}), # security + _mock_claude_result( + { + "validated_findings": [ + {"finding_index": idx, "keep": True, "confidence": 0.95, "reason": "valid"} + for idx in range(len(findings)) + ] + } + ), # validation + ] + + class TestFullWorkflowIntegration: """Test complete workflow scenarios.""" @@ -242,23 +267,7 @@ def test_full_workflow_with_real_pr_structure(self, mock_get, mock_run): ] } - # Mock Claude CLI - version_result = Mock() - version_result.returncode = 0 - version_result.stdout = 'claude version 1.0.0' - version_result.stderr = '' - - audit_result = Mock() - audit_result.returncode = 0 - # Claude wraps the result in a specific format - claude_wrapped_response = { - 'result': json.dumps(claude_response) - } - audit_result.stdout = json.dumps(claude_wrapped_response) - audit_result.stderr = '' - - # Provide results for unified review (single pass) - mock_run.side_effect = [version_result, audit_result] + mock_run.side_effect = _multiphase_run_side_effect(claude_response["findings"]) # Run the workflow with tempfile.TemporaryDirectory() as tmpdir: @@ -279,12 +288,12 @@ def test_full_workflow_with_real_pr_structure(self, mock_get, mock_run): assert exc_info.value.code == 1 # Verify API calls - # 5 calls: PR data, files, diff, comments, reactions for bot comment - assert mock_get.call_count == 5 - assert mock_run.call_count == 2 # 1 version check + 1 unified review + # 3 calls: PR data, files, diff + assert mock_get.call_count == 3 + assert mock_run.call_count == 7 # 1 version check + 6 phase calls # Verify the audit was run with proper prompt - audit_call = mock_run.call_args_list[1] + audit_call = mock_run.call_args_list[1] # triage prompt prompt = audit_call[1]['input'] assert 'Add new authentication feature' 
in prompt # Title assert 'src/auth/oauth2.py' in prompt # File name @@ -337,11 +346,7 @@ def test_workflow_with_llm_filtering(self, mock_get, mock_run): } ] - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout=json.dumps({"findings": claude_findings}), stderr=''), - Mock(returncode=0, stdout=json.dumps({"findings": []}), stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect(claude_findings) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -415,11 +420,7 @@ def test_workflow_with_no_review_issues(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] # Claude finds no issues - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": [], "analysis_summary": {"review_completed": true}}', stderr=''), - Mock(returncode=0, stdout='{"findings": [], "analysis_summary": {"review_completed": true}}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -506,11 +507,7 @@ def test_workflow_with_massive_pr(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] # Claude handles it gracefully - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -586,11 +583,7 @@ def test_workflow_with_binary_files(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) diff --git a/scripts/comment-pr-findings.bun.test.js b/scripts/comment-pr-findings.bun.test.js index e5dae90..d28878c 100644 --- a/scripts/comment-pr-findings.bun.test.js +++ b/scripts/comment-pr-findings.bun.test.js @@ -1026,7 +1026,7 @@ describe('comment-pr-findings.js', () => { id: 201, state: 'APPROVED', user: { type: 'Bot' }, - body: '📋 **PR Summary:**\nThis PR adds auth.\n\n---\n\nNo issues found. Changes look good.' + body: '\nNo issues found. Changes look good.' } ]; @@ -1076,7 +1076,7 @@ describe('comment-pr-findings.js', () => { test('should update existing review in place when state is unchanged and no inline comments', async () => { // Existing APPROVED review, new findings also result in APPROVED (no HIGH severity) const mockReviews = [ - { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: 'No issues found. Changes look good.' } + { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: '\nNo issues found. Changes look good.' } ]; readFileSyncSpy.mockImplementation((path) => { @@ -1129,7 +1129,7 @@ describe('comment-pr-findings.js', () => { test('should dismiss and create new review when state changes', async () => { // Existing APPROVED review, but new findings have HIGH severity = CHANGES_REQUESTED const mockReviews = [ - { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: 'No issues found. Changes look good.' 
} + { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: '\nNo issues found. Changes look good.' } ]; const mockFindings = [{ diff --git a/scripts/comment-pr-findings.js b/scripts/comment-pr-findings.js index 270f98b..0dbd4ee 100755 --- a/scripts/comment-pr-findings.js +++ b/scripts/comment-pr-findings.js @@ -6,6 +6,7 @@ const fs = require('fs'); const { spawnSync } = require('child_process'); +const REVIEW_MARKER = ''; // PR Summary marker for identifying our summary sections const PR_SUMMARY_MARKER = '📋 **PR Summary:**'; @@ -102,26 +103,7 @@ function addReactionsToReview(reviewId) { // Check if a review was posted by this action function isOwnReview(review) { if (!review.body) return false; - - // Check for our review summary patterns - const ownPatterns = [ - PR_SUMMARY_MARKER, - 'No issues found. Changes look good.', - /^Found \d+ .+ issues?\./, - 'Please address the high-severity issues before merging.', - 'Consider addressing the suggestions in the comments.', - 'Minor suggestions noted in comments.' - ]; - - for (const pattern of ownPatterns) { - if (pattern instanceof RegExp) { - if (pattern.test(review.body)) return true; - } else { - if (review.body.includes(pattern)) return true; - } - } - - return false; + return review.body.includes(REVIEW_MARKER); } // Find an existing review posted by this action @@ -135,8 +117,8 @@ function findExistingReview() { for (const review of reviews) { const isDismissible = review.state === 'APPROVED' || review.state === 'CHANGES_REQUESTED'; - const isBot = review.user && review.user.type === 'Bot'; const isOwn = isOwnReview(review); + const isBot = review.user && review.user.type === 'Bot'; if (isBot && isDismissible && isOwn) { return review; @@ -293,7 +275,7 @@ async function run() { const highSeverityCount = analysisSummary.high_severity || 0; const reviewEvent = highSeverityCount > 0 ? 
'REQUEST_CHANGES' : 'APPROVE'; - const reviewBody = buildReviewSummary(newFindings, prSummary, analysisSummary); + const reviewBody = `${buildReviewSummary(newFindings, prSummary, analysisSummary)}\n\n${REVIEW_MARKER}`; // Prepare review comments const reviewComments = []; diff --git a/scripts/determine-claudecode-enablement.sh b/scripts/determine-claudecode-enablement.sh index a71ffc2..5a9865d 100755 --- a/scripts/determine-claudecode-enablement.sh +++ b/scripts/determine-claudecode-enablement.sh @@ -12,7 +12,6 @@ # - TRIGGER_ON_COMMIT: Whether to run on new commits (true/false) # - TRIGGER_ON_REVIEW_REQUEST: Whether to run on review requests (true/false) # - TRIGGER_ON_MENTION: Whether to run on bot mentions (true/false) -# - RUN_EVERY_COMMIT: Legacy flag for running on commits (true/false) # - SKIP_DRAFT_PRS: Whether to skip draft PRs (true/false) # - IS_DRAFT: Whether the PR is a draft (true/false) # - REQUIRE_LABEL: Required label name (optional) @@ -54,9 +53,8 @@ if [ "$ENABLE_CLAUDECODE" == "true" ]; then fi ;; commit) - # Check both new and legacy input names (run-every-commit is alias for trigger-on-commit) - if [ "${TRIGGER_ON_COMMIT:-false}" != "true" ] && [ "${RUN_EVERY_COMMIT:-false}" != "true" ]; then - echo "Trigger 'commit' is disabled via trigger-on-commit/run-every-commit input" + if [ "${TRIGGER_ON_COMMIT:-false}" != "true" ]; then + echo "Trigger 'commit' is disabled via trigger-on-commit input" ENABLE_CLAUDECODE="false" fi ;; @@ -121,4 +119,4 @@ if [ "$ENABLE_CLAUDECODE" == "true" ]; then echo "ClaudeCode is enabled for this run (trigger: $TRIGGER_TYPE, PR: #$PR_NUMBER, SHA: $GITHUB_SHA)" else echo "ClaudeCode is disabled for this run" -fi \ No newline at end of file +fi diff --git a/scripts/test-determine-claudecode-enablement.sh b/scripts/test-determine-claudecode-enablement.sh index 404e56c..7bc06e2 100755 --- a/scripts/test-determine-claudecode-enablement.sh +++ b/scripts/test-determine-claudecode-enablement.sh @@ -35,7 +35,7 @@ teardown_test() { # Unset all environment variables that tests might set unset GITHUB_EVENT_NAME PR_NUMBER GITHUB_SHA TRIGGER_TYPE unset TRIGGER_ON_OPEN TRIGGER_ON_COMMIT TRIGGER_ON_REVIEW_REQUEST TRIGGER_ON_MENTION - unset RUN_EVERY_COMMIT SKIP_DRAFT_PRS IS_DRAFT REQUIRE_LABEL PR_LABELS IS_PR + unset SKIP_DRAFT_PRS IS_DRAFT REQUIRE_LABEL PR_LABELS IS_PR } # Helper to run the script and capture output @@ -143,24 +143,6 @@ test_commit_trigger_with_new_input() { teardown_test } -test_commit_trigger_with_legacy_input() { - echo "Test: Commit trigger with run-every-commit (legacy) enabled" - setup_test - - export GITHUB_EVENT_NAME="pull_request" - export PR_NUMBER="123" - export GITHUB_SHA="abc123" - export TRIGGER_TYPE="commit" - export TRIGGER_ON_COMMIT="false" - export RUN_EVERY_COMMIT="true" - - run_script - - assert_equals "true" "$ENABLE_CLAUDECODE" "Should enable for legacy run-every-commit" - - teardown_test -} - test_commit_trigger_disabled() { echo "Test: Commit trigger disabled" setup_test @@ -170,7 +152,6 @@ test_commit_trigger_disabled() { export GITHUB_SHA="abc123" export TRIGGER_TYPE="commit" export TRIGGER_ON_COMMIT="false" - export RUN_EVERY_COMMIT="false" run_script @@ -453,7 +434,6 @@ test_unsupported_event_type test_pull_request_event_open_trigger_enabled test_pull_request_event_open_trigger_disabled test_commit_trigger_with_new_input -test_commit_trigger_with_legacy_input test_commit_trigger_disabled test_review_request_trigger test_mention_trigger @@ -484,4 +464,4 @@ if [ $TESTS_FAILED -eq 0 ]; then else echo -e 
"${RED}Some tests failed!${NC}" exit 1 -fi \ No newline at end of file +fi