From d3ae6d579f529a29ed6356ec8664ac5eadaac7de Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Mon, 9 Feb 2026 13:14:31 +0100 Subject: [PATCH 01/12] Refactor review action to multi-phase orchestrator with model routing --- .claude/agents/compliance.md | 8 + .claude/agents/quality.md | 8 + .claude/agents/security.md | 8 + .claude/agents/triage.md | 8 + .claude/agents/validator.md | 8 + .claude/commands/review.md | 7 + README.md | 17 +- action.yml | 30 ++ claudecode/__init__.py | 5 + claudecode/findings_merge.py | 61 ++++ claudecode/github_action_audit.py | 395 +++++++--------------- claudecode/prompts.py | 535 ++++++++++++++++-------------- claudecode/review_orchestrator.py | 293 ++++++++++++++++ 13 files changed, 853 insertions(+), 530 deletions(-) create mode 100644 .claude/agents/compliance.md create mode 100644 .claude/agents/quality.md create mode 100644 .claude/agents/security.md create mode 100644 .claude/agents/triage.md create mode 100644 .claude/agents/validator.md create mode 100644 claudecode/findings_merge.py create mode 100644 claudecode/review_orchestrator.py diff --git a/.claude/agents/compliance.md b/.claude/agents/compliance.md new file mode 100644 index 0000000..463e14f --- /dev/null +++ b/.claude/agents/compliance.md @@ -0,0 +1,8 @@ +--- +name: compliance +model: claude-sonnet-4-20250514 +description: CLAUDE.md compliance specialist +--- + +Audit changed files against relevant CLAUDE.md guidance. +Return only JSON findings with concrete rule references. diff --git a/.claude/agents/quality.md b/.claude/agents/quality.md new file mode 100644 index 0000000..e1bf26b --- /dev/null +++ b/.claude/agents/quality.md @@ -0,0 +1,8 @@ +--- +name: quality +model: claude-opus-4-5-20251101 +description: Code quality specialist for correctness and reliability +--- + +Find high-signal correctness, reliability, and performance issues. +Return only JSON findings. diff --git a/.claude/agents/security.md b/.claude/agents/security.md new file mode 100644 index 0000000..ea521cd --- /dev/null +++ b/.claude/agents/security.md @@ -0,0 +1,8 @@ +--- +name: security +model: claude-opus-4-5-20251101 +description: Security specialist for exploitable vulnerabilities +--- + +Find exploitable vulnerabilities in changed code with concrete attack paths. +Return only JSON findings including exploit preconditions and trust boundary. diff --git a/.claude/agents/triage.md b/.claude/agents/triage.md new file mode 100644 index 0000000..33591ba --- /dev/null +++ b/.claude/agents/triage.md @@ -0,0 +1,8 @@ +--- +name: triage +model: claude-3-5-haiku-20241022 +description: Fast PR triage for skip/continue decisions +--- + +Determine whether review can be skipped safely. +Return only JSON with `skip_review`, `reason`, and `risk_level`. diff --git a/.claude/agents/validator.md b/.claude/agents/validator.md new file mode 100644 index 0000000..8235ee9 --- /dev/null +++ b/.claude/agents/validator.md @@ -0,0 +1,8 @@ +--- +name: validator +model: claude-sonnet-4-20250514 +description: Finding validation and deduplication specialist +--- + +Validate candidate findings with strict confidence and impact criteria. +Return only JSON decisions for keep/drop. diff --git a/.claude/commands/review.md b/.claude/commands/review.md index 50ae979..9a9e14f 100644 --- a/.claude/commands/review.md +++ b/.claude/commands/review.md @@ -40,6 +40,12 @@ To do this, follow these steps precisely: Agent 4: Opus security agent Look for security vulnerabilities in the introduced code. 
This includes injection, auth bypass, data exposure, unsafe deserialization, or other exploitable issues. Only look for issues that fall within the changed code. + Security evidence requirements for every reported issue: + - Include a concrete exploit or abuse path. + - Include attacker preconditions. + - Identify the impacted trust boundary or sensitive asset. + - Provide an actionable mitigation. + **CRITICAL: We only want HIGH SIGNAL issues.** Flag issues where: - The code will fail to compile or parse (syntax errors, type errors, missing imports, unresolved references) - The code will definitely produce wrong results regardless of inputs (clear logic errors) @@ -52,6 +58,7 @@ To do this, follow these steps precisely: - Subjective suggestions or improvements - Security issues that depend on speculative inputs or unverified assumptions - Denial of Service (DoS) or rate limiting issues without concrete exploitability + - Findings based only on diff snippets without validating surrounding repository context If you are not certain an issue is real, do not flag it. False positives erode trust and waste reviewer time. diff --git a/README.md b/README.md index 2474766..06285d3 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,11 @@ This action is not hardened against prompt injection attacks and should only be | `upload-results` | Whether to upload results as artifacts | `true` | No | | `exclude-directories` | Comma-separated list of directories to exclude from scanning | None | No | | `claude-model` | Claude [model name](https://docs.anthropic.com/en/docs/about-claude/models/overview#model-names) to use. Defaults to Opus 4.5. | `claude-opus-4-5-20251101` | No | +| `model-triage` | Model used for triage phase (skip/continue decision). | `claude-3-5-haiku-20241022` | No | +| `model-compliance` | Model used for CLAUDE.md compliance phase. | `claude-sonnet-4-20250514` | No | +| `model-quality` | Model used for code quality phase. | `claude-opus-4-5-20251101` | No | +| `model-security` | Model used for security phase. | `claude-opus-4-5-20251101` | No | +| `model-validation` | Model used for finding validation phase. | `claude-sonnet-4-20250514` | No | | `claudecode-timeout` | Timeout for ClaudeCode analysis in minutes | `20` | No | | `run-every-commit` | Run ClaudeCode on every commit (skips cache check). Warning: May increase false positives on PRs with many commits. **Deprecated**: Use `trigger-on-commit` instead. | `false` | No | | `trigger-on-open` | Run review when PR is first opened | `true` | No | @@ -127,6 +132,7 @@ This action is not hardened against prompt injection attacks and should only be | `skip-draft-prs` | Skip code review on draft pull requests | `true` | No | | `app-slug` | GitHub App slug for bot mention detection. If using `actions/create-github-app-token@v1.9.0+`, pass `${{ steps.app-token.outputs.app-slug }}`. Otherwise defaults to `github-actions`. | `github-actions` | No | | `require-label` | Only run review if this label is present. Leave empty to review all PRs. Add `labeled` to your workflow `pull_request` types to trigger on label addition. | None | No | +| `max-diff-lines` | Maximum inline diff lines included as prompt anchor; repository tool reads are still required in all cases. | `5000` | No | ### Action Outputs @@ -294,11 +300,12 @@ claudecode/ ### Workflow -1. **PR Analysis**: When a pull request is opened, Claude analyzes the diff to understand what changed -2. 
**Contextual Review**: Claude examines the code changes in context, understanding the purpose and potential impacts -3. **Finding Generation**: Issues are identified with detailed explanations, severity ratings, and remediation guidance -4. **False Positive Filtering**: Advanced filtering removes low-impact or false positive prone findings to reduce noise -5. **PR Comments**: Findings are posted as review comments on the specific lines of code +1. **Triage Phase**: A fast triage pass determines if review should proceed. +2. **Context Discovery**: Claude discovers relevant CLAUDE.md files, hotspots, and risky code paths. +3. **Specialist Review**: Dedicated compliance, quality, and security phases run with configurable models. +4. **Validation Phase**: Candidate findings are validated and deduplicated for high signal. +5. **False Positive Filtering**: Additional filtering removes low-impact noise. +6. **PR Comments**: Findings are posted as review comments on specific lines in the PR. ## Review Capabilities diff --git a/action.yml b/action.yml index ec846aa..8f0485e 100644 --- a/action.yml +++ b/action.yml @@ -33,6 +33,31 @@ inputs: required: false default: '' + model-triage: + description: 'Model for triage phase' + required: false + default: 'claude-3-5-haiku-20241022' + + model-compliance: + description: 'Model for CLAUDE.md compliance phase' + required: false + default: 'claude-sonnet-4-20250514' + + model-quality: + description: 'Model for code quality phase' + required: false + default: 'claude-opus-4-5-20251101' + + model-security: + description: 'Model for security phase' + required: false + default: 'claude-opus-4-5-20251101' + + model-validation: + description: 'Model for validation phase' + required: false + default: 'claude-sonnet-4-20250514' + run-every-commit: description: 'DEPRECATED: Use trigger-on-commit instead. Run ClaudeCode on every commit (skips cache check). Warning: This may lead to more false positives on PRs with many commits as the AI analyzes the same code multiple times.' 
required: false @@ -351,6 +376,11 @@ runs: CUSTOM_REVIEW_INSTRUCTIONS: ${{ inputs.custom-review-instructions }} CUSTOM_SECURITY_SCAN_INSTRUCTIONS: ${{ inputs.custom-security-scan-instructions }} CLAUDE_MODEL: ${{ inputs.claude-model }} + MODEL_TRIAGE: ${{ inputs.model-triage }} + MODEL_COMPLIANCE: ${{ inputs.model-compliance }} + MODEL_QUALITY: ${{ inputs.model-quality }} + MODEL_SECURITY: ${{ inputs.model-security }} + MODEL_VALIDATION: ${{ inputs.model-validation }} CLAUDECODE_TIMEOUT: ${{ inputs.claudecode-timeout }} MAX_DIFF_LINES: ${{ inputs.max-diff-lines }} ACTION_PATH: ${{ github.action_path }} diff --git a/claudecode/__init__.py b/claudecode/__init__.py index d0cbfdc..07611ed 100644 --- a/claudecode/__init__.py +++ b/claudecode/__init__.py @@ -12,11 +12,16 @@ from claudecode.github_action_audit import ( GitHubActionClient, SimpleClaudeRunner, + get_review_model_config, main ) +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator __all__ = [ "GitHubActionClient", "SimpleClaudeRunner", + "ReviewModelConfig", + "ReviewOrchestrator", + "get_review_model_config", "main" ] diff --git a/claudecode/findings_merge.py b/claudecode/findings_merge.py new file mode 100644 index 0000000..aae73cf --- /dev/null +++ b/claudecode/findings_merge.py @@ -0,0 +1,61 @@ +"""Utilities for merging and deduplicating findings from multiple phases.""" + +from typing import Any, Dict, List, Tuple + + +def _normalize_text(value: Any) -> str: + return str(value or "").strip().lower() + + +def _finding_key(finding: Dict[str, Any]) -> Tuple[str, int, str, str]: + file_path = _normalize_text(finding.get("file")) + line = finding.get("line") + try: + line_no = int(line) + except (TypeError, ValueError): + line_no = 1 + category = _normalize_text(finding.get("category")) + title = _normalize_text(finding.get("title")) + return file_path, line_no, category, title + + +def _severity_rank(value: Any) -> int: + sev = _normalize_text(value).upper() + if sev == "HIGH": + return 3 + if sev == "MEDIUM": + return 2 + if sev == "LOW": + return 1 + return 0 + + +def _confidence_value(value: Any) -> float: + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def merge_findings(findings: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Merge duplicate findings and keep the strongest candidate.""" + merged: Dict[Tuple[str, int, str, str], Dict[str, Any]] = {} + + for finding in findings: + if not isinstance(finding, dict): + continue + + key = _finding_key(finding) + existing = merged.get(key) + + if existing is None: + merged[key] = finding + continue + + incoming_score = (_severity_rank(finding.get("severity")), _confidence_value(finding.get("confidence"))) + existing_score = (_severity_rank(existing.get("severity")), _confidence_value(existing.get("confidence"))) + + if incoming_score > existing_score: + merged[key] = finding + + return list(merged.values()) diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index dbf6766..b46ae3e 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -15,10 +15,10 @@ import time # Import existing components we can reuse -from claudecode.prompts import get_unified_review_prompt from claudecode.findings_filter import FindingsFilter from claudecode.json_parser import parse_json_with_fallbacks -from claudecode.format_pr_comments import format_pr_comments_for_prompt, is_bot_comment +from claudecode.prompts import get_unified_review_prompt # Backward-compatible import for 
tests/extensions. +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator from claudecode.constants import ( EXIT_CONFIGURATION_ERROR, DEFAULT_CLAUDE_MODEL, @@ -27,7 +27,6 @@ SUBPROCESS_TIMEOUT ) from claudecode.logger import get_logger -from claudecode.review_schema import REVIEW_OUTPUT_SCHEMA logger = get_logger(__name__) @@ -186,98 +185,23 @@ def get_pr_data(self, repo_name: str, pr_number: int) -> Dict[str, Any]: def get_pr_diff(self, repo_name: str, pr_number: int) -> str: """Get complete PR diff in unified format. - + Args: repo_name: Repository name in format "owner/repo" pr_number: Pull request number - + Returns: Complete PR diff in unified format """ url = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}" headers = dict(self.headers) headers['Accept'] = 'application/vnd.github.diff' - + response = requests.get(url, headers=headers) response.raise_for_status() - + return self._filter_generated_files(response.text) - - def get_pr_comments(self, repo_name: str, pr_number: int) -> List[Dict[str, Any]]: - """Get all review comments for a PR with pagination. - - Args: - repo_name: Repository name in format "owner/repo" - pr_number: Pull request number - - Returns: - List of comment dictionaries from GitHub API - """ - all_comments = [] - page = 1 - per_page = 100 - - while True: - url = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}/comments" - params = {'per_page': per_page, 'page': page} - - try: - response = requests.get(url, headers=self.headers, params=params) - response.raise_for_status() - comments = response.json() - - if not comments: - break - - all_comments.extend(comments) - - # Check if there are more pages - if len(comments) < per_page: - break - - page += 1 - - except requests.RequestException as e: - logger.warning(f"Failed to fetch comments page {page}: {e}") - break - - return all_comments - - def get_comment_reactions(self, repo_name: str, comment_id: int) -> Dict[str, int]: - """Get reactions for a specific comment, excluding bot reactions. - - Args: - repo_name: Repository name in format "owner/repo" - comment_id: The comment ID to fetch reactions for - - Returns: - Dictionary with reaction counts (e.g., {'+1': 3, '-1': 1}) - """ - url = f"https://api.github.com/repos/{repo_name}/pulls/comments/{comment_id}/reactions" - - try: - response = requests.get(url, headers=self.headers) - response.raise_for_status() - reactions = response.json() - - # Count reactions, excluding those from bots - counts = {} - for reaction in reactions: - user = reaction.get('user', {}) - # Skip bot reactions (the bot adds its own 👍👎 as seeds) - if user.get('type') == 'Bot': - continue - - content = reaction.get('content', '') - if content: - counts[content] = counts.get(content, 0) + 1 - - return counts - - except requests.RequestException as e: - logger.debug(f"Failed to fetch reactions for comment {comment_id}: {e}") - return {} - + def _is_excluded(self, filepath: str) -> bool: """Check if a file should be excluded based on directory or file patterns.""" import fnmatch @@ -355,113 +279,105 @@ def __init__(self, timeout_minutes: Optional[int] = None): else: self.timeout_seconds = SUBPROCESS_TIMEOUT - def run_code_review(self, repo_dir: Path, prompt: str) -> Tuple[bool, str, Dict[str, Any]]: - """Run Claude Code review. 
- - Args: - repo_dir: Path to repository directory - prompt: Code review prompt - - Returns: - Tuple of (success, error_message, parsed_results) - """ + def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) -> Tuple[bool, str, Any]: + """Run a single Claude prompt and return parsed JSON payload or text.""" if not repo_dir.exists(): return False, f"Repository directory does not exist: {repo_dir}", {} - - # Check prompt size + prompt_size = len(prompt.encode('utf-8')) if prompt_size > 1024 * 1024: # 1MB print(f"[Warning] Large prompt size: {prompt_size / 1024 / 1024:.2f}MB", file=sys.stderr) - + + model_name = model or DEFAULT_CLAUDE_MODEL try: - # Construct Claude Code command - # Use stdin for prompt to avoid "argument list too long" error cmd = [ 'claude', '--output-format', 'json', - '--model', DEFAULT_CLAUDE_MODEL, - '--disallowed-tools', 'Bash(ps:*)', - '--json-schema', json.dumps(REVIEW_OUTPUT_SCHEMA) + '--model', model_name, + '--disallowed-tools', 'Bash(ps:*)' ] - - # Run Claude Code with retry logic + NUM_RETRIES = 3 for attempt in range(NUM_RETRIES): result = subprocess.run( cmd, - input=prompt, # Pass prompt via stdin + input=prompt, cwd=repo_dir, capture_output=True, text=True, timeout=self.timeout_seconds ) - # Parse JSON output (even if returncode != 0, to detect specific errors) - success, parsed_result = parse_json_with_fallbacks(result.stdout, "Claude Code output") + if result.returncode != 0: + if attempt == NUM_RETRIES - 1: + error_details = f"Claude Code execution failed with return code {result.returncode}\n" + error_details += f"Stderr: {result.stderr}\n" + error_details += f"Stdout: {result.stdout[:500]}..." + return False, error_details, {} + time.sleep(5 * attempt) + continue + success, parsed_result = parse_json_with_fallbacks(result.stdout, "Claude Code output") if success: - # Check for "Prompt is too long" error that should trigger fallback to agentic mode - if (isinstance(parsed_result, dict) and - parsed_result.get('type') == 'result' and + if (isinstance(parsed_result, dict) and + parsed_result.get('type') == 'result' and parsed_result.get('subtype') == 'success' and parsed_result.get('is_error') and parsed_result.get('result') == 'Prompt is too long'): return False, "PROMPT_TOO_LONG", {} - # Check for error_during_execution that should trigger retry - if (isinstance(parsed_result, dict) and - parsed_result.get('type') == 'result' and + if (isinstance(parsed_result, dict) and + parsed_result.get('type') == 'result' and parsed_result.get('subtype') == 'error_during_execution' and attempt == 0): - continue # Retry + continue - # If returncode is 0, extract review findings - if result.returncode == 0: - parsed_results = self._extract_review_findings(parsed_result) - return True, "", parsed_results - - # Handle non-zero return codes after parsing - if result.returncode != 0: - if attempt == NUM_RETRIES - 1: - error_details = f"Claude Code execution failed with return code {result.returncode}\n" - error_details += f"Stderr: {result.stderr}\n" - error_details += f"Stdout: {result.stdout[:500]}..." 
# First 500 chars - return False, error_details, {} - else: - time.sleep(5*attempt) - # Note: We don't do exponential backoff here to keep the runtime reasonable - continue # Retry + if isinstance(parsed_result, dict) and 'result' in parsed_result and isinstance(parsed_result['result'], str): + nested_success, nested = parse_json_with_fallbacks(parsed_result['result'], "Claude result text") + if nested_success: + return True, "", nested + return True, "", parsed_result - # Parse failed if attempt == 0: - continue # Retry once - else: - return False, "Failed to parse Claude output", {} - + continue + return False, "Failed to parse Claude output", {} + return False, "Unexpected error in retry logic", {} - except subprocess.TimeoutExpired: return False, f"Claude Code execution timed out after {self.timeout_seconds // 60} minutes", {} except Exception as e: return False, f"Claude Code execution error: {str(e)}", {} + + def run_code_review(self, repo_dir: Path, prompt: str) -> Tuple[bool, str, Dict[str, Any]]: + """Run code review prompt and normalize to findings payload.""" + success, error_msg, parsed = self.run_prompt(repo_dir, prompt, model=DEFAULT_CLAUDE_MODEL) + if not success: + return False, error_msg, {} + if isinstance(parsed, dict) and 'findings' in parsed: + return True, "", parsed + return True, "", self._extract_review_findings(parsed) def _extract_review_findings(self, claude_output: Any) -> Dict[str, Any]: - """Extract review findings and PR summary from Claude's JSON response.""" + """Extract review findings from Claude's JSON response.""" if isinstance(claude_output, dict): - # Only accept Claude Code wrapper with result field - # Direct format without wrapper is not supported if 'result' in claude_output: result_text = claude_output['result'] if isinstance(result_text, str): # Try to extract JSON from the result text success, result_json = parse_json_with_fallbacks(result_text, "Claude result text") - if success and result_json and 'findings' in result_json and 'pr_summary' in result_json: + if success and result_json and 'findings' in result_json: return result_json - + # Return empty structure if no findings found return { 'findings': [], - 'pr_summary': {} + 'analysis_summary': { + 'files_reviewed': 0, + 'high_severity': 0, + 'medium_severity': 0, + 'low_severity': 0, + 'review_completed': False, + } } def validate_claude_available(self) -> Tuple[bool, str]: @@ -559,17 +475,14 @@ def initialize_findings_filter(custom_filtering_instructions: Optional[str] = No ConfigurationError: If filter initialization fails """ try: - # Check if we should use heuristic (pattern-based) filtering - use_heuristic_filtering = os.environ.get('ENABLE_HEURISTIC_FILTERING', 'true').lower() == 'true' - # Check if we should use Claude API filtering use_claude_filtering = os.environ.get('ENABLE_CLAUDE_FILTERING', 'false').lower() == 'true' api_key = os.environ.get('ANTHROPIC_API_KEY') - + if use_claude_filtering and api_key: # Use full filtering with Claude API return FindingsFilter( - use_hard_exclusions=use_heuristic_filtering, + use_hard_exclusions=True, use_claude_filtering=True, api_key=api_key, custom_filtering_instructions=custom_filtering_instructions @@ -577,13 +490,28 @@ def initialize_findings_filter(custom_filtering_instructions: Optional[str] = No else: # Fallback to filtering with hard rules only return FindingsFilter( - use_hard_exclusions=use_heuristic_filtering, + use_hard_exclusions=True, use_claude_filtering=False ) except Exception as e: raise ConfigurationError(f'Failed to 
initialize findings filter: {str(e)}') +def get_review_model_config() -> ReviewModelConfig: + """Resolve per-phase model configuration from environment.""" + return ReviewModelConfig.from_env(os.environ, DEFAULT_CLAUDE_MODEL) + + +def get_max_diff_lines() -> int: + """Return bounded inline diff budget in lines.""" + max_diff_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') + try: + parsed = int(max_diff_lines_str) + except ValueError: + parsed = 5000 + return max(0, parsed) + + def run_code_review(claude_runner: SimpleClaudeRunner, prompt: str) -> Dict[str, Any]: """Run the code review with Claude Code. @@ -744,147 +672,52 @@ def main(): print(json.dumps({'error': f'Failed to fetch PR data: {str(e)}'})) sys.exit(EXIT_GENERAL_ERROR) - # Fetch PR comments and build review context - review_context = None - try: - pr_comments = github_client.get_pr_comments(repo_name, pr_number) - if pr_comments: - # Build threads: find bot comments, their replies, and reactions - bot_comment_threads = [] - for comment in pr_comments: - if is_bot_comment(comment): - # This is a bot comment (thread root) - reactions = github_client.get_comment_reactions(repo_name, comment['id']) - - # Find replies to this comment - replies = [ - c for c in pr_comments - if c.get('in_reply_to_id') == comment['id'] - ] - # Sort replies by creation time - replies.sort(key=lambda c: c.get('created_at', '')) - - bot_comment_threads.append({ - 'bot_comment': comment, - 'replies': replies, - 'reactions': reactions, - }) - - # Sort threads by bot comment creation time (oldest first) - bot_comment_threads.sort(key=lambda t: t['bot_comment'].get('created_at', '')) - - if bot_comment_threads: - review_context = format_pr_comments_for_prompt(bot_comment_threads) - if review_context: - logger.info(f"Fetched previous review context ({len(review_context)} chars)") - except Exception as e: - logger.warning(f"Failed to fetch review context (continuing without it): {e}") - review_context = None - - # Determine whether to embed diff or use agentic file reading - max_diff_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') - try: - max_diff_lines = int(max_diff_lines_str) - except ValueError: - max_diff_lines = 5000 - + max_diff_lines = get_max_diff_lines() diff_line_count = len(pr_diff.splitlines()) - use_agentic_mode = max_diff_lines == 0 or diff_line_count > max_diff_lines - - if use_agentic_mode: - print(f"[Info] Using agentic file reading mode (diff has {diff_line_count} lines, threshold: {max_diff_lines})", file=sys.stderr) + if max_diff_lines and diff_line_count > max_diff_lines: + print(f"[Info] Hybrid mode with truncated inline diff ({max_diff_lines}/{diff_line_count} lines)", file=sys.stderr) else: - print(f"[Debug] Embedding diff in prompt ({diff_line_count} lines)", file=sys.stderr) + print(f"[Info] Hybrid mode with full inline diff ({diff_line_count} lines)", file=sys.stderr) # Get repo directory from environment or use current directory repo_path = os.environ.get('REPO_PATH') repo_dir = Path(repo_path) if repo_path else Path.cwd() + model_config = get_review_model_config() + orchestrator = ReviewOrchestrator( + claude_runner=claude_runner, + findings_filter=findings_filter, + github_client=github_client, + model_config=model_config, + max_diff_lines=max_diff_lines, + ) - def run_review(include_diff: bool): - prompt_text = get_unified_review_prompt( - pr_data, - pr_diff if include_diff else None, - include_diff=include_diff, - custom_review_instructions=custom_review_instructions, - 
custom_security_instructions=custom_security_instructions, - review_context=review_context, - ) - return claude_runner.run_code_review(repo_dir, prompt_text), len(prompt_text) - - all_findings = [] - pr_summary_from_review = {} - - try: - (success, error_msg, review_results), prompt_len = run_review(include_diff=not use_agentic_mode) - - # Fallback to agentic mode if prompt still too long - if not success and error_msg == "PROMPT_TOO_LONG": - print(f"[Info] Prompt too long ({prompt_len} chars), falling back to agentic mode", file=sys.stderr) - (success, error_msg, review_results), prompt_len = run_review(include_diff=False) - - if not success: - raise AuditError(f'Code review failed: {error_msg}') - - pr_summary_from_review = review_results.get('pr_summary', {}) - for finding in review_results.get('findings', []): - if isinstance(finding, dict): - # Set review_type based on category - category = finding.get('category', '').lower() - if category == 'security': - finding.setdefault('review_type', 'security') - else: - finding.setdefault('review_type', 'general') - all_findings.append(finding) - - except AuditError as e: - print(json.dumps({'error': f'Code review failed: {str(e)}'})) + success, review_results, review_error = orchestrator.run( + repo_dir=repo_dir, + pr_data=pr_data, + pr_diff=pr_diff, + custom_review_instructions=custom_review_instructions, + custom_security_instructions=custom_security_instructions, + ) + if not success: + print(json.dumps({'error': f'Code review failed: {review_error}'})) sys.exit(EXIT_GENERAL_ERROR) - # Filter findings to reduce false positives - original_findings = all_findings - - # Prepare PR context for better filtering - pr_context = { - 'repo_name': repo_name, - 'pr_number': pr_number, - 'title': pr_data.get('title', ''), - 'description': pr_data.get('body', '') - } - - # Apply findings filter (including final directory exclusion) - kept_findings, excluded_findings, analysis_summary = apply_findings_filter( - findings_filter, original_findings, pr_context, github_client - ) - - # Prepare output summary - def severity_counts(findings_list): - high = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) - medium = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'MEDIUM']) - low = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'LOW']) - return high, medium, low - - high_count, medium_count, low_count = severity_counts(kept_findings) - - # Calculate files_reviewed from pr_summary.file_changes - files_reviewed = 0 - if isinstance(pr_summary_from_review, dict): - file_changes = pr_summary_from_review.get('file_changes', []) - if isinstance(file_changes, list): - # Count unique files from all 'files' arrays - all_files = set() - for entry in file_changes: - if isinstance(entry, dict): - files_list = entry.get('files', []) - if isinstance(files_list, list): - all_files.update(files_list) - files_reviewed = len(all_files) + findings = review_results.get('findings', []) + analysis_summary = review_results.get('analysis_summary', {}) + high_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) + medium_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'MEDIUM']) + low_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'LOW']) + files_reviewed = analysis_summary.get('files_reviewed', 
pr_data.get('changed_files', 0)) + try: + files_reviewed = int(files_reviewed) + except (TypeError, ValueError): + files_reviewed = 0 # Prepare output output = { 'pr_number': pr_number, 'repo': repo_name, - 'pr_summary': pr_summary_from_review, - 'findings': kept_findings, + 'findings': findings, 'analysis_summary': { 'files_reviewed': files_reviewed, 'high_severity': high_count, @@ -892,20 +725,18 @@ def severity_counts(findings_list): 'low_severity': low_count, 'review_completed': True }, - 'filtering_summary': { - 'total_original_findings': len(original_findings), - 'excluded_findings': len(excluded_findings), - 'kept_findings': len(kept_findings), - 'filter_analysis': analysis_summary, - 'excluded_findings_details': excluded_findings # Include full details of what was filtered - } + 'filtering_summary': review_results.get('filtering_summary', { + 'total_original_findings': len(findings), + 'excluded_findings': 0, + 'kept_findings': len(findings), + }) } # Output JSON to stdout print(json.dumps(output, indent=2)) # Exit with appropriate code - high_severity_count = len([f for f in kept_findings if f.get('severity', '').upper() == 'HIGH']) + high_severity_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) sys.exit(EXIT_GENERAL_ERROR if high_severity_count > 0 else EXIT_SUCCESS) except Exception as e: diff --git a/claudecode/prompts.py b/claudecode/prompts.py index 6786858..3ab8286 100644 --- a/claudecode/prompts.py +++ b/claudecode/prompts.py @@ -1,268 +1,317 @@ -"""Code review prompt templates.""" +"""Prompt templates for multi-phase PR review orchestration.""" +from typing import Any, Dict, List, Optional -def _format_files_changed(pr_data): + +def _format_files_changed(pr_data: Dict[str, Any]) -> str: """Format changed files for prompt context.""" - return "\n".join([f"- {f['filename']}" for f in pr_data['files']]) + files = pr_data.get("files", []) + return "\n".join([f"- {f.get('filename', 'unknown')}" for f in files]) + + +def _build_hybrid_diff_section(pr_diff: str, max_lines: int) -> str: + """Build a bounded inline diff section while requiring tool-based context reads.""" + if not pr_diff: + return "\nNo inline diff available. Use repository tools to inspect changed files.\n" + lines = pr_diff.splitlines() + if max_lines > 0 and len(lines) > max_lines: + shown = "\n".join(lines[:max_lines]) + truncated_note = ( + f"\n[Diff truncated to {max_lines} lines out of {len(lines)} total lines. " + "You MUST use repository tools to inspect full context and missing hunks.]" + ) + return f"\nINLINE DIFF ANCHOR (TRUNCATED):\n```diff\n{shown}\n```{truncated_note}\n" -def _build_diff_section(pr_diff, include_diff): - """Build prompt section for inline diff or agentic file reading.""" - if pr_diff and include_diff: - return f""" + return ( + "\nINLINE DIFF ANCHOR:\n" + f"```diff\n{pr_diff}\n```\n" + "Use this as a starting point only. You MUST validate findings with repository tool reads.\n" + ) -PR DIFF CONTENT: -``` -{pr_diff} -``` -Review the complete diff above. This contains all code changes in the PR. 
+def _base_context_block(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Shared context block used across prompts.""" + files_changed = _format_files_changed(pr_data) + return f""" +PR CONTEXT: +- PR Number: {pr_data.get('number', 'unknown')} +- Title: {pr_data.get('title', 'unknown')} +- Author: {pr_data.get('user', 'unknown')} +- Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} +- Files changed: {pr_data.get('changed_files', 0)} +- Lines added: {pr_data.get('additions', 0)} +- Lines deleted: {pr_data.get('deletions', 0)} +- PR body: {pr_data.get('body', '') or 'No description'} + +MODIFIED FILES: +{files_changed or '- None listed'} +{_build_hybrid_diff_section(pr_diff, max_diff_lines)} +MANDATORY CONTEXT VALIDATION RULES: +1. You MUST use repository tools to read each relevant changed file before finalizing findings. +2. For every finding, verify at least one additional contextual location (caller, callee, config, or sibling path). +3. Do not rely on inline diff alone, even when diff is fully present. """ - return """ -IMPORTANT - FILE READING INSTRUCTIONS: -You have access to the repository files. For each file listed above, use the Read tool to examine the changes. -Focus on the files that are most likely to contain issues based on the PR context. +def _findings_output_schema(extra_fields: str = "") -> str: + return f""" +OUTPUT JSON SCHEMA (exact keys): +{{ + "findings": [ + {{ + "file": "path/to/file.py", + "line": 42, + "severity": "HIGH|MEDIUM|LOW", + "category": "correctness|reliability|performance|maintainability|testing|security|compliance", + "title": "Short issue title", + "description": "What is wrong", + "impact": "Concrete failure mode or exploit path", + "recommendation": "Actionable fix", + "confidence": 0.93{extra_fields} + }} + ], + "analysis_summary": {{ + "files_reviewed": 0, + "high_severity": 0, + "medium_severity": 0, + "low_severity": 0, + "review_completed": true + }} +}} +""" -To review effectively: -1. Read each modified file to understand the current code -2. Look at surrounding code context when needed to understand the changes -3. Check related files if you need to understand dependencies or usage patterns + +def build_triage_prompt(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Prompt for triage phase.""" + return f""" +You are the triage specialist for a pull request review. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +Decide whether review should be skipped. +Skip only if one of the following is true: +- PR is clearly trivial and cannot contain correctness/security/compliance risk +- PR is obviously generated deployment churn with no business logic changes +- There are no meaningful code changes in reviewed files + +Return JSON only: +{{ + "skip_review": false, + "reason": "short reason", + "risk_level": "low|medium|high" +}} """ -def get_unified_review_prompt( - pr_data, - pr_diff=None, - include_diff=True, - custom_review_instructions=None, - custom_security_instructions=None, - review_context=None, -): - """Generate unified code review + security prompt for Claude Code. - - This prompt covers both code quality (correctness, reliability, performance, - maintainability, testing) and security in a single pass. 
- - Args: - pr_data: PR data dictionary from GitHub API - pr_diff: Optional complete PR diff in unified format - include_diff: Whether to include the diff in the prompt (default: True) - custom_review_instructions: Optional custom review instructions to append - custom_security_instructions: Optional custom security instructions to append - review_context: Optional previous review context (bot findings and user replies) - - Returns: - Formatted prompt string - """ +def build_context_discovery_prompt(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Prompt for context discovery phase.""" + return f""" +You are the repository context specialist. - files_changed = _format_files_changed(pr_data) - diff_section = _build_diff_section(pr_diff, include_diff) - - custom_review_section = "" - if custom_review_instructions: - custom_review_section = f"\n{custom_review_instructions}\n" - - custom_security_section = "" - if custom_security_instructions: - custom_security_section = f"\n{custom_security_instructions}\n" - - # Build PR description section - pr_description = pr_data.get('body', '').strip() if pr_data.get('body') else '' - pr_description_section = "" - if pr_description: - # Truncate very long descriptions - if len(pr_description) > 2000: - pr_description = pr_description[:2000] + "... (truncated)" - pr_description_section = f"\nPR Description:\n{pr_description}\n" - - # Build review context section (previous bot reviews and user replies) - review_context_section = "" - if review_context: - review_context_section = review_context +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +Tasks: +1. Find relevant CLAUDE.md files: root and those in changed-file parent paths. +2. Summarize PR intent and risky hotspots. +3. Identify top files for deep review. + +Return JSON only: +{{ + "claude_md_files": ["path/CLAUDE.md"], + "change_summary": "brief summary", + "hotspots": ["path/to/file"], + "priority_files": ["path/to/file"] +}} +""" + +def build_compliance_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], +) -> str: + """Prompt for CLAUDE.md compliance analysis.""" + context_json = discovered_context or {} return f""" -You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data['number']}: "{pr_data['title']}" +You are the CLAUDE.md compliance specialist. -CONTEXT: -- Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} -- Author: {pr_data['user']} -- Files changed: {pr_data['changed_files']} -- Lines added: {pr_data['additions']} -- Lines deleted: {pr_data['deletions']} -{pr_description_section} -Files modified: -{files_changed}{diff_section}{review_context_section} - -OBJECTIVE: -Perform a focused, high-signal code review to identify HIGH-CONFIDENCE issues introduced by this PR. This covers both code quality (correctness, reliability, performance, maintainability, testing) AND security. Do not comment on pre-existing issues or purely stylistic preferences. - -CRITICAL INSTRUCTIONS: -1. MINIMIZE FALSE POSITIVES: Only flag issues where you're >80% confident they are real and impactful -2. AVOID NOISE: Skip style nits, subjective preferences, or low-impact suggestions -3. FOCUS ON IMPACT: Prioritize bugs, regressions, data loss, significant performance problems, or security vulnerabilities -4. SCOPE: Only evaluate code introduced or modified in this PR. 
Ignore unrelated existing issues - -CODE QUALITY CATEGORIES: - -**Correctness & Logic:** -- Incorrect business logic or wrong results -- Edge cases or null/empty handling regressions -- Incorrect error handling or missing validations leading to bad state -- Invariants broken by changes - -**Reliability & Resilience:** -- Concurrency or race conditions introduced by changes -- Resource leaks, timeouts, or missing retries in critical paths -- Partial failure handling or inconsistent state updates -- Idempotency or ordering issues - -**Performance & Scalability:** -- Algorithmic regressions in hot paths (O(n^2) where O(n) expected) -- N+1 query patterns -- Excessive synchronous I/O in latency-sensitive code -- Unbounded memory growth introduced by changes - -**Maintainability & Design:** -- Changes that significantly increase complexity or make future changes risky -- Tight coupling or unclear responsibility boundaries introduced -- Misleading APIs or brittle contracts - -**Testing & Observability:** -- Missing tests for high-risk changes -- Lack of logging/metrics around new critical behavior -- Flaky behavior due to nondeterministic changes -{custom_review_section} -SECURITY CATEGORIES: - -**Input Validation Vulnerabilities:** -- SQL injection via unsanitized user input -- Command injection in system calls or subprocesses -- XXE injection in XML parsing -- Template injection in templating engines -- NoSQL injection in database queries -- Path traversal in file operations - -**Authentication & Authorization Issues:** -- Authentication bypass logic -- Privilege escalation paths -- Session management flaws -- JWT token vulnerabilities -- Authorization logic bypasses - -**Crypto & Secrets Management:** -- Hardcoded API keys, passwords, or tokens -- Weak cryptographic algorithms or implementations -- Improper key storage or management -- Cryptographic randomness issues -- Certificate validation bypasses - -**Injection & Code Execution:** -- Remote code execution via deserialization -- Pickle injection in Python -- YAML deserialization vulnerabilities -- Eval injection in dynamic code execution -- XSS vulnerabilities in web applications (reflected, stored, DOM-based) - -**Data Exposure:** -- Sensitive data logging or storage -- PII handling violations -- API endpoint data leakage -- Debug information exposure -{custom_security_section} -EXCLUSIONS - DO NOT REPORT: -- Denial of Service (DOS) vulnerabilities or resource exhaustion attacks -- Secrets/credentials stored on disk (these are managed separately) -- Rate limiting concerns or service overload scenarios - -Additional notes: -- Even if something is only exploitable from the local network, it can still be a HIGH severity issue - -ANALYSIS METHODOLOGY: - -Phase 1 - Repository Context Research (Use file search tools): -- Identify existing patterns, conventions, and critical paths -- Understand data flow, invariants, and error handling expectations -- Look for established security frameworks and patterns - -Phase 2 - Comparative Analysis: -- Compare new changes to existing patterns and contracts -- Identify deviations that introduce risk, regressions, or security issues -- Look for inconsistent handling between similar code paths - -Phase 3 - Issue Assessment: -- Examine each modified file for code quality and security implications -- Trace data flow from inputs to sensitive operations -- Identify concurrency, state management, and injection risks - -REQUIRED OUTPUT FORMAT: - -You MUST output your findings as structured JSON with this exact 
schema: +{_base_context_block(pr_data, pr_diff, max_diff_lines)} +DISCOVERED CONTEXT: +{context_json} + +Focus exclusively on clear CLAUDE.md violations in changed code. Cite concrete violated rule text in each finding. +Reject ambiguous or preference-only claims. + +{_findings_output_schema(',\n "rule_reference": "path/to/CLAUDE.md#section"')} + +Return JSON only. +""" + + +def build_quality_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], + custom_review_instructions: Optional[str] = None, +) -> str: + """Prompt for code quality analysis.""" + custom_block = f"\nCUSTOM QUALITY INSTRUCTIONS:\n{custom_review_instructions}\n" if custom_review_instructions else "" + return f""" +You are the code quality specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +DISCOVERED CONTEXT: +{discovered_context or {}} + +Focus on high-signal issues only: +- correctness and logic defects +- reliability regressions +- significant performance regressions +- maintainability risks with concrete failure/bug potential +- testing gaps only when they block confidence for risky behavior + +Exclude style-only feedback and speculative concerns. +{custom_block} +{_findings_output_schema()} + +Return JSON only. +""" + + +def build_security_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], + custom_security_instructions: Optional[str] = None, +) -> str: + """Prompt for security analysis with explicit exploitability criteria.""" + custom_block = ( + f"\nCUSTOM SECURITY INSTRUCTIONS:\n{custom_security_instructions}\n" + if custom_security_instructions + else "" + ) + + return f""" +You are the security specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +DISCOVERED CONTEXT: +{discovered_context or {}} + +Security review scope: +- injection (SQL/command/template/NoSQL/path traversal) +- authn/authz bypass and privilege escalation +- unsafe deserialization and code execution paths +- crypto and secrets handling flaws +- sensitive data exposure and trust-boundary breaks + +For every security finding you MUST provide: +1. exploit or abuse path +2. required attacker preconditions +3. impacted trust boundary or sensitive asset +4. concrete mitigation + +Do NOT report: +- generic DoS/rate-limiting comments without concrete exploitability +- speculative attacks without evidence in changed code paths +- issues outside modified scope unless required to prove exploitability +{custom_block} +{_findings_output_schema(',\n "exploit_preconditions": "...",\n "trust_boundary": "...",\n "cwe": "optional CWE-###"')} + +Return JSON only. +""" + + +def build_validation_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + candidate_findings: List[Dict[str, Any]], +) -> str: + """Prompt for finding validation and deduplication support.""" + return f""" +You are the validation specialist. 
+ +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +CANDIDATE FINDINGS: +{candidate_findings} + +Validate each finding with strict criteria: +- must be reproducible or clearly inferable from changed code and surrounding context +- must have concrete impact +- confidence must be >= 0.8 to keep +- if two findings are duplicates, keep the stronger one only + +Return JSON only: {{ - "pr_summary": {{ - "overview": "2-4 sentence summary of what this PR changes and why it matters", - "file_changes": [ - {{ - "label": "src/auth.py", - "files": ["src/auth.py"], - "changes": "Brief description of changes (~10 words)" - }}, - {{ - "label": "tests/test_*.py", - "files": ["tests/test_auth.py", "tests/test_login.py"], - "changes": "Unit tests for authentication" - }} - ] - }}, - "findings": [ + "validated_findings": [ {{ - "file": "path/to/file.py", - "line": 42, - "severity": "HIGH", - "category": "correctness|reliability|performance|maintainability|testing|security", - "title": "Short summary of the issue", - "description": "What is wrong and where it happens", - "impact": "Concrete impact or failure mode (use exploit scenario for security issues)", - "recommendation": "Actionable fix or mitigation", - "suggestion": "Exact replacement code (optional). Can be multi-line. Must replace lines from suggestion_start_line to suggestion_end_line.", - "suggestion_start_line": 42, - "suggestion_end_line": 44, - "confidence": 0.95 + "finding_index": 0, + "keep": true, + "confidence": 0.92, + "reason": "short reason" }} ] }} - -PR SUMMARY GUIDELINES: -- overview: 2-4 sentences describing WHAT changed and WHY (purpose/goal) -- file_changes: One entry per file or group of related files - - label: Display name (single file path, or pattern like "tests/*.py" for groups) - - files: Array of actual file paths covered by this entry (used for counting) - - changes: Brief description (~10 words), focus on purpose not implementation - - Group related files when it improves readability (e.g., test files, config files) - -SUGGESTION GUIDELINES: -- Only include `suggestion` if you can provide exact, working replacement code -- For single-line fixes: set suggestion_start_line = suggestion_end_line = the line number -- For multi-line fixes: set the range of lines being replaced -- The suggestion replaces all lines from suggestion_start_line to suggestion_end_line (inclusive) - -SEVERITY GUIDELINES: -- **HIGH**: Likely production bug, data loss, significant regression, or directly exploitable security vulnerability -- **MEDIUM**: Real issue with limited scope or specific triggering conditions -- **LOW**: Minor but real issue; use sparingly and only if clearly actionable - -CONFIDENCE SCORING: -- 0.9-1.0: Certain issue with clear evidence and impact -- 0.8-0.9: Strong signal with likely real-world impact -- 0.7-0.8: Plausible issue but may require specific conditions -- Below 0.7: Don't report (too speculative) - -FINAL REMINDER: -Focus on HIGH and MEDIUM findings only. Better to miss some theoretical issues than flood the report with false positives. Each finding should be something a senior engineer would confidently raise in a PR review. - -Begin your analysis now. Use the repository exploration tools to understand the codebase context, then analyze the PR changes for code quality and security implications. - -Your final reply must contain the JSON and nothing else. You should not reply again after outputting the JSON. 
""" + + +def get_unified_review_prompt( + pr_data: Dict[str, Any], + pr_diff: Optional[str] = None, + include_diff: bool = True, + custom_review_instructions: Optional[str] = None, + custom_security_instructions: Optional[str] = None, +) -> str: + """Backward-compatible unified prompt used by tests and direct calls. + + The unified prompt now enforces hybrid behavior: even when diff is included, + repository context reads are still mandatory. + """ + diff_text = pr_diff if include_diff else "" + max_lines = len(diff_text.splitlines()) if diff_text else 0 + combined = build_quality_prompt( + pr_data, + diff_text, + max_lines, + discovered_context={}, + custom_review_instructions=custom_review_instructions, + ) + security = build_security_prompt( + pr_data, + diff_text, + max_lines, + discovered_context={}, + custom_security_instructions=custom_security_instructions, + ) + + file_reading_block = ( + "\nIMPORTANT - FILE READING INSTRUCTIONS:\n" + "You MUST read changed files and nearby context with repository tools before final findings.\n" + ) + if include_diff and diff_text: + diff_anchor = f"\nPR DIFF CONTENT:\n```diff\n{diff_text}\n```\n" + else: + diff_anchor = "\n" + + return ( + f"You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data.get('number', 'unknown')}.\n" + "CONTEXT:\n" + f"- Title: {pr_data.get('title', 'unknown')}\n" + "OBJECTIVE:\n" + "Perform a high-signal code quality and security review with mandatory repository context validation.\n" + + diff_anchor + + file_reading_block + + "\nREQUIRED OUTPUT FORMAT:\nJSON only.\n\n" + + combined + + "\n\n" + + security + ) diff --git a/claudecode/review_orchestrator.py b/claudecode/review_orchestrator.py new file mode 100644 index 0000000..79590b8 --- /dev/null +++ b/claudecode/review_orchestrator.py @@ -0,0 +1,293 @@ +"""Multi-phase review orchestration for GitHub Action execution.""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from claudecode.findings_merge import merge_findings +from claudecode.json_parser import parse_json_with_fallbacks +from claudecode.logger import get_logger +from claudecode.prompts import ( + build_compliance_prompt, + build_context_discovery_prompt, + build_quality_prompt, + build_security_prompt, + build_triage_prompt, + build_validation_prompt, +) + +logger = get_logger(__name__) + + +@dataclass +class ReviewModelConfig: + """Per-phase model configuration with global fallback.""" + + triage: str + compliance: str + quality: str + security: str + validation: str + + @classmethod + def from_env(cls, env: Dict[str, str], default_model: str) -> "ReviewModelConfig": + def resolve(key: str, fallback: str) -> str: + value = (env.get(key) or "").strip() + return value or fallback + + global_model = resolve("CLAUDE_MODEL", default_model) + return cls( + triage=resolve("MODEL_TRIAGE", global_model), + compliance=resolve("MODEL_COMPLIANCE", global_model), + quality=resolve("MODEL_QUALITY", global_model), + security=resolve("MODEL_SECURITY", global_model), + validation=resolve("MODEL_VALIDATION", global_model), + ) + + +class ReviewOrchestrator: + """Coordinates multi-phase review and returns final findings.""" + + def __init__( + self, + claude_runner: Any, + findings_filter: Any, + github_client: Any, + model_config: ReviewModelConfig, + max_diff_lines: int, + ): + self.claude_runner = claude_runner + self.findings_filter = findings_filter + self.github_client = github_client + self.model_config = 
model_config + self.max_diff_lines = max(0, max_diff_lines) + + def _run_phase(self, repo_dir: Path, prompt: str, model: str, phase_name: str) -> Tuple[bool, Dict[str, Any], str]: + raw_result = None + if hasattr(self.claude_runner, "run_prompt"): + raw_result = self.claude_runner.run_prompt(repo_dir, prompt, model=model) + if not (isinstance(raw_result, tuple) and len(raw_result) == 3) and hasattr(self.claude_runner, "run_code_review"): + raw_result = self.claude_runner.run_code_review(repo_dir, prompt) + if not (isinstance(raw_result, tuple) and len(raw_result) == 3): + return False, {}, f"Invalid runner response for {phase_name}" + + success, error_msg, raw = raw_result + if not success: + return False, {}, error_msg + + if isinstance(raw, dict): + return True, raw, "" + + parsed_ok, parsed = parse_json_with_fallbacks(str(raw), f"{phase_name} output") + if not parsed_ok: + return False, {}, f"Failed to parse {phase_name} output" + return True, parsed, "" + + def _collect_phase_findings(self, phase_result: Dict[str, Any], source_agent: str) -> List[Dict[str, Any]]: + findings: List[Dict[str, Any]] = [] + for finding in phase_result.get("findings", []): + if isinstance(finding, dict): + enriched = finding.copy() + enriched.setdefault("source_agent", source_agent) + category = str(enriched.get("category", "")).lower() + if "review_type" not in enriched: + enriched["review_type"] = "security" if category == "security" else "general" + findings.append(enriched) + return findings + + def _ensure_review_type(self, finding: Dict[str, Any]) -> Dict[str, Any]: + enriched = finding.copy() + category = str(enriched.get("category", "")).lower() + if "review_type" not in enriched: + enriched["review_type"] = "security" if category == "security" else "general" + return enriched + + def run( + self, + repo_dir: Path, + pr_data: Dict[str, Any], + pr_diff: str, + custom_review_instructions: Optional[str] = None, + custom_security_instructions: Optional[str] = None, + ) -> Tuple[bool, Dict[str, Any], str]: + # Phase A: triage + triage_prompt = build_triage_prompt(pr_data, pr_diff, self.max_diff_lines) + ok, triage_result, err = self._run_phase(repo_dir, triage_prompt, self.model_config.triage, "triage") + if not ok: + return False, {}, f"Triage phase failed: {err}" + + # Backward-compatible fast path for legacy single-pass outputs. 
+ if isinstance(triage_result, dict) and "findings" in triage_result: + original_count = len([f for f in triage_result.get("findings", []) if isinstance(f, dict)]) + legacy_findings = [] + for finding in triage_result.get("findings", []): + if isinstance(finding, dict) and not self.github_client._is_excluded(finding.get("file", "")): + legacy_findings.append(self._ensure_review_type(finding)) + pr_context = { + "repo_name": pr_data.get("head", {}).get("repo", {}).get("full_name", "unknown"), + "pr_number": pr_data.get("number"), + "title": pr_data.get("title", ""), + "description": pr_data.get("body", ""), + } + filter_response = self.findings_filter.filter_findings(legacy_findings, pr_context) + if isinstance(filter_response, tuple) and len(filter_response) == 3: + filter_success, filter_results, _ = filter_response + if filter_success and isinstance(filter_results, dict): + legacy_findings = filter_results.get("filtered_findings", legacy_findings) + legacy_findings = [self._ensure_review_type(f) for f in legacy_findings if isinstance(f, dict)] + high = len([f for f in legacy_findings if str(f.get("severity", "")).upper() == "HIGH"]) + medium = len([f for f in legacy_findings if str(f.get("severity", "")).upper() == "MEDIUM"]) + low = len([f for f in legacy_findings if str(f.get("severity", "")).upper() == "LOW"]) + return True, { + "findings": legacy_findings, + "analysis_summary": { + "files_reviewed": triage_result.get("analysis_summary", {}).get("files_reviewed", pr_data.get("changed_files", 0)), + "high_severity": high, + "medium_severity": medium, + "low_severity": low, + "review_completed": True, + }, + "filtering_summary": { + "total_original_findings": original_count, + "excluded_findings": max(0, original_count - len(legacy_findings)), + "kept_findings": len(legacy_findings), + }, + "triage": {"skip_review": False, "reason": "legacy_single_pass", "risk_level": "medium"}, + }, "" + + if triage_result.get("skip_review") is True: + logger.info("Skipping review based on triage decision: %s", triage_result.get("reason", "")) + return True, { + "findings": [], + "analysis_summary": { + "files_reviewed": 0, + "high_severity": 0, + "medium_severity": 0, + "low_severity": 0, + "review_completed": True, + }, + "triage": triage_result, + }, "" + + # Phase B: context discovery + context_prompt = build_context_discovery_prompt(pr_data, pr_diff, self.max_diff_lines) + ok, context_result, err = self._run_phase(repo_dir, context_prompt, self.model_config.compliance, "context discovery") + if not ok: + return False, {}, f"Context discovery phase failed: {err}" + + # Phase C: specialist passes + compliance_prompt = build_compliance_prompt(pr_data, pr_diff, self.max_diff_lines, context_result) + quality_prompt = build_quality_prompt( + pr_data, + pr_diff, + self.max_diff_lines, + context_result, + custom_review_instructions=custom_review_instructions, + ) + security_prompt = build_security_prompt( + pr_data, + pr_diff, + self.max_diff_lines, + context_result, + custom_security_instructions=custom_security_instructions, + ) + + ok_c, compliance_result, err_c = self._run_phase(repo_dir, compliance_prompt, self.model_config.compliance, "compliance") + ok_q, quality_result, err_q = self._run_phase(repo_dir, quality_prompt, self.model_config.quality, "quality") + ok_s, security_result, err_s = self._run_phase(repo_dir, security_prompt, self.model_config.security, "security") + + if not ok_c: + return False, {}, f"Compliance phase failed: {err_c}" + if not ok_q: + return False, {}, f"Quality phase 
failed: {err_q}" + if not ok_s: + return False, {}, f"Security phase failed: {err_s}" + + all_candidates: List[Dict[str, Any]] = [] + all_candidates.extend(self._collect_phase_findings(compliance_result, "compliance")) + all_candidates.extend(self._collect_phase_findings(quality_result, "quality")) + all_candidates.extend(self._collect_phase_findings(security_result, "security")) + + all_candidates = merge_findings(all_candidates) + + # Phase D: validation + validation_prompt = build_validation_prompt(pr_data, pr_diff, self.max_diff_lines, all_candidates) + ok_v, validation_result, err_v = self._run_phase(repo_dir, validation_prompt, self.model_config.validation, "validation") + if not ok_v: + return False, {}, f"Validation phase failed: {err_v}" + + validated: List[Dict[str, Any]] = [] + decisions = validation_result.get("validated_findings", []) + for decision in decisions: + if not isinstance(decision, dict): + continue + idx = decision.get("finding_index") + keep = bool(decision.get("keep")) + confidence = decision.get("confidence", 0.0) + try: + idx_int = int(idx) + except (TypeError, ValueError): + continue + if idx_int < 0 or idx_int >= len(all_candidates): + continue + if keep and float(confidence or 0.0) >= 0.8: + finding = all_candidates[idx_int].copy() + finding["confidence"] = float(confidence) + validated.append(finding) + + # Fallback: keep merged findings if validation returned no decisions + if decisions and not validated: + validated = [] + elif not decisions: + validated = all_candidates + + # Apply existing filtering pipeline + pr_context = { + "repo_name": pr_data.get("head", {}).get("repo", {}).get("full_name", "unknown"), + "pr_number": pr_data.get("number"), + "title": pr_data.get("title", ""), + "description": pr_data.get("body", ""), + } + kept_findings = validated + original_count = len(validated) + filter_response = self.findings_filter.filter_findings(validated, pr_context) + if isinstance(filter_response, tuple) and len(filter_response) == 3: + filter_success, filter_results, _ = filter_response + if filter_success and isinstance(filter_results, dict): + kept_findings = filter_results.get("filtered_findings", validated) + + final_findings: List[Dict[str, Any]] = [] + for finding in kept_findings: + if not isinstance(finding, dict): + continue + if self.github_client._is_excluded(finding.get("file", "")): + continue + final_findings.append(self._ensure_review_type(finding)) + + high = len([f for f in final_findings if str(f.get("severity", "")).upper() == "HIGH"]) + medium = len([f for f in final_findings if str(f.get("severity", "")).upper() == "MEDIUM"]) + low = len([f for f in final_findings if str(f.get("severity", "")).upper() == "LOW"]) + + files_reviewed = pr_data.get("changed_files", 0) + try: + files_reviewed = int(files_reviewed) + except (TypeError, ValueError): + files_reviewed = 0 + + return True, { + "findings": final_findings, + "analysis_summary": { + "files_reviewed": files_reviewed, + "high_severity": high, + "medium_severity": medium, + "low_severity": low, + "review_completed": True, + }, + "filtering_summary": { + "total_original_findings": original_count, + "excluded_findings": max(0, original_count - len(final_findings)), + "kept_findings": len(final_findings), + }, + "context_summary": context_result, + "triage": triage_result, + }, "" From c4e9c301c61391d981763e5049512b4cfb2a8afb Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Mon, 9 Feb 2026 13:36:47 +0100 Subject: [PATCH 02/12] Fix prompt f-string compatibility for Python 3.10 CI 
--- claudecode/prompts.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/claudecode/prompts.py b/claudecode/prompts.py index 3ab8286..87dccf8 100644 --- a/claudecode/prompts.py +++ b/claudecode/prompts.py @@ -2,6 +2,12 @@ from typing import Any, Dict, List, Optional +COMPLIANCE_EXTRA_FIELDS = ',\n "rule_reference": "path/to/CLAUDE.md#section"' +SECURITY_EXTRA_FIELDS = ( + ',\n "exploit_preconditions": "...",\n "trust_boundary": "...",\n' + ' "cwe": "optional CWE-###"' +) + def _format_files_changed(pr_data: Dict[str, Any]) -> str: """Format changed files for prompt context.""" @@ -145,7 +151,7 @@ def build_compliance_prompt( Focus exclusively on clear CLAUDE.md violations in changed code. Cite concrete violated rule text in each finding. Reject ambiguous or preference-only claims. -{_findings_output_schema(',\n "rule_reference": "path/to/CLAUDE.md#section"')} + {_findings_output_schema(COMPLIANCE_EXTRA_FIELDS)} Return JSON only. """ @@ -223,7 +229,7 @@ def build_security_prompt( - speculative attacks without evidence in changed code paths - issues outside modified scope unless required to prove exploitability {custom_block} -{_findings_output_schema(',\n "exploit_preconditions": "...",\n "trust_boundary": "...",\n "cwe": "optional CWE-###"')} + {_findings_output_schema(SECURITY_EXTRA_FIELDS)} Return JSON only. """ From c014c457ff7fbfc3ae01d162dacebafbb635bc7c Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Tue, 10 Feb 2026 12:29:56 +0100 Subject: [PATCH 03/12] Address PR review feedback on model routing and validation logic --- claudecode/github_action_audit.py | 6 +++--- claudecode/review_orchestrator.py | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index b46ae3e..7501e9a 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -314,7 +314,7 @@ def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) - error_details += f"Stderr: {result.stderr}\n" error_details += f"Stdout: {result.stdout[:500]}..." 
return False, error_details, {} - time.sleep(5 * attempt) + time.sleep(5 * (attempt + 1)) continue success, parsed_result = parse_json_with_fallbacks(result.stdout, "Claude Code output") @@ -348,9 +348,9 @@ def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) - except Exception as e: return False, f"Claude Code execution error: {str(e)}", {} - def run_code_review(self, repo_dir: Path, prompt: str) -> Tuple[bool, str, Dict[str, Any]]: + def run_code_review(self, repo_dir: Path, prompt: str, model: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]: """Run code review prompt and normalize to findings payload.""" - success, error_msg, parsed = self.run_prompt(repo_dir, prompt, model=DEFAULT_CLAUDE_MODEL) + success, error_msg, parsed = self.run_prompt(repo_dir, prompt, model=model or DEFAULT_CLAUDE_MODEL) if not success: return False, error_msg, {} if isinstance(parsed, dict) and 'findings' in parsed: diff --git a/claudecode/review_orchestrator.py b/claudecode/review_orchestrator.py index 79590b8..19a6dc3 100644 --- a/claudecode/review_orchestrator.py +++ b/claudecode/review_orchestrator.py @@ -67,7 +67,11 @@ def _run_phase(self, repo_dir: Path, prompt: str, model: str, phase_name: str) - if hasattr(self.claude_runner, "run_prompt"): raw_result = self.claude_runner.run_prompt(repo_dir, prompt, model=model) if not (isinstance(raw_result, tuple) and len(raw_result) == 3) and hasattr(self.claude_runner, "run_code_review"): - raw_result = self.claude_runner.run_code_review(repo_dir, prompt) + try: + raw_result = self.claude_runner.run_code_review(repo_dir, prompt, model=model) + except TypeError: + # Backward compatibility for legacy mocks/runners that don't accept model parameter. + raw_result = self.claude_runner.run_code_review(repo_dir, prompt) if not (isinstance(raw_result, tuple) and len(raw_result) == 3): return False, {}, f"Invalid runner response for {phase_name}" @@ -217,7 +221,10 @@ def run( return False, {}, f"Validation phase failed: {err_v}" validated: List[Dict[str, Any]] = [] - decisions = validation_result.get("validated_findings", []) + has_validation_output = isinstance(validation_result, dict) and "validated_findings" in validation_result + decisions = validation_result.get("validated_findings", []) if isinstance(validation_result, dict) else [] + if not isinstance(decisions, list): + decisions = [] for decision in decisions: if not isinstance(decision, dict): continue @@ -235,10 +242,9 @@ def run( finding["confidence"] = float(confidence) validated.append(finding) - # Fallback: keep merged findings if validation returned no decisions - if decisions and not validated: - validated = [] - elif not decisions: + # If validator did not return decisions at all, preserve candidates. + # If it explicitly returned validated_findings (including empty list), trust validator output. 
+ if not has_validation_output: validated = all_candidates # Apply existing filtering pipeline From 37a1142d7113d0944a0a27a7e1fe8e525000dd76 Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Tue, 10 Feb 2026 12:35:10 +0100 Subject: [PATCH 04/12] Use latest Claude model aliases for phase defaults --- .claude/agents/compliance.md | 2 +- .claude/agents/quality.md | 2 +- .claude/agents/security.md | 2 +- .claude/agents/triage.md | 2 +- .claude/agents/validator.md | 2 +- README.md | 12 ++++++------ action.yml | 12 ++++++------ claudecode/claude_api_client.py | 2 +- claudecode/constants.py | 3 +-- 9 files changed, 19 insertions(+), 20 deletions(-) diff --git a/.claude/agents/compliance.md b/.claude/agents/compliance.md index 463e14f..262021c 100644 --- a/.claude/agents/compliance.md +++ b/.claude/agents/compliance.md @@ -1,6 +1,6 @@ --- name: compliance -model: claude-sonnet-4-20250514 +model: claude-sonnet-4-5 description: CLAUDE.md compliance specialist --- diff --git a/.claude/agents/quality.md b/.claude/agents/quality.md index e1bf26b..9b64144 100644 --- a/.claude/agents/quality.md +++ b/.claude/agents/quality.md @@ -1,6 +1,6 @@ --- name: quality -model: claude-opus-4-5-20251101 +model: claude-opus-4-6 description: Code quality specialist for correctness and reliability --- diff --git a/.claude/agents/security.md b/.claude/agents/security.md index ea521cd..2295274 100644 --- a/.claude/agents/security.md +++ b/.claude/agents/security.md @@ -1,6 +1,6 @@ --- name: security -model: claude-opus-4-5-20251101 +model: claude-opus-4-6 description: Security specialist for exploitable vulnerabilities --- diff --git a/.claude/agents/triage.md b/.claude/agents/triage.md index 33591ba..69fa84c 100644 --- a/.claude/agents/triage.md +++ b/.claude/agents/triage.md @@ -1,6 +1,6 @@ --- name: triage -model: claude-3-5-haiku-20241022 +model: claude-haiku-4-5 description: Fast PR triage for skip/continue decisions --- diff --git a/.claude/agents/validator.md b/.claude/agents/validator.md index 8235ee9..1d69298 100644 --- a/.claude/agents/validator.md +++ b/.claude/agents/validator.md @@ -1,6 +1,6 @@ --- name: validator -model: claude-sonnet-4-20250514 +model: claude-sonnet-4-5 description: Finding validation and deduplication specialist --- diff --git a/README.md b/README.md index 06285d3..2c36f21 100644 --- a/README.md +++ b/README.md @@ -111,12 +111,12 @@ This action is not hardened against prompt injection attacks and should only be | `comment-pr` | Whether to comment on PRs with findings | `true` | No | | `upload-results` | Whether to upload results as artifacts | `true` | No | | `exclude-directories` | Comma-separated list of directories to exclude from scanning | None | No | -| `claude-model` | Claude [model name](https://docs.anthropic.com/en/docs/about-claude/models/overview#model-names) to use. Defaults to Opus 4.5. | `claude-opus-4-5-20251101` | No | -| `model-triage` | Model used for triage phase (skip/continue decision). | `claude-3-5-haiku-20241022` | No | -| `model-compliance` | Model used for CLAUDE.md compliance phase. | `claude-sonnet-4-20250514` | No | -| `model-quality` | Model used for code quality phase. | `claude-opus-4-5-20251101` | No | -| `model-security` | Model used for security phase. | `claude-opus-4-5-20251101` | No | -| `model-validation` | Model used for finding validation phase. | `claude-sonnet-4-20250514` | No | +| `claude-model` | Claude [model name](https://docs.anthropic.com/en/docs/about-claude/models/overview#model-names) to use. Defaults to Opus 4.6. 
| `claude-opus-4-6` | No | +| `model-triage` | Model used for triage phase (skip/continue decision). | `claude-haiku-4-5` | No | +| `model-compliance` | Model used for CLAUDE.md compliance phase. | `claude-sonnet-4-5` | No | +| `model-quality` | Model used for code quality phase. | `claude-opus-4-6` | No | +| `model-security` | Model used for security phase. | `claude-opus-4-6` | No | +| `model-validation` | Model used for finding validation phase. | `claude-sonnet-4-5` | No | | `claudecode-timeout` | Timeout for ClaudeCode analysis in minutes | `20` | No | | `run-every-commit` | Run ClaudeCode on every commit (skips cache check). Warning: May increase false positives on PRs with many commits. **Deprecated**: Use `trigger-on-commit` instead. | `false` | No | | `trigger-on-open` | Run review when PR is first opened | `true` | No | diff --git a/action.yml b/action.yml index 8f0485e..a5708a5 100644 --- a/action.yml +++ b/action.yml @@ -29,34 +29,34 @@ inputs: default: '' claude-model: - description: 'Claude model to use for code review analysis (e.g., claude-sonnet-4-20250514)' + description: 'Claude model to use for code review analysis (e.g., claude-sonnet-4-5)' required: false default: '' model-triage: description: 'Model for triage phase' required: false - default: 'claude-3-5-haiku-20241022' + default: 'claude-haiku-4-5' model-compliance: description: 'Model for CLAUDE.md compliance phase' required: false - default: 'claude-sonnet-4-20250514' + default: 'claude-sonnet-4-5' model-quality: description: 'Model for code quality phase' required: false - default: 'claude-opus-4-5-20251101' + default: 'claude-opus-4-6' model-security: description: 'Model for security phase' required: false - default: 'claude-opus-4-5-20251101' + default: 'claude-opus-4-6' model-validation: description: 'Model for validation phase' required: false - default: 'claude-sonnet-4-20250514' + default: 'claude-sonnet-4-5' run-every-commit: description: 'DEPRECATED: Use trigger-on-commit instead. Run ClaudeCode on every commit (skips cache check). Warning: This may lead to more false positives on PRs with many commits as the AI analyzes the same code multiple times.' 
diff --git a/claudecode/claude_api_client.py b/claudecode/claude_api_client.py index 5504b93..849f733 100644 --- a/claudecode/claude_api_client.py +++ b/claudecode/claude_api_client.py @@ -59,7 +59,7 @@ def validate_api_access(self) -> Tuple[bool, str]: try: # Simple test call to verify API access self.client.messages.create( - model="claude-3-5-haiku-20241022", + model="claude-haiku-4-5", max_tokens=10, messages=[{"role": "user", "content": "Hello"}], timeout=10 diff --git a/claudecode/constants.py b/claudecode/constants.py index bbf7988..6c81257 100644 --- a/claudecode/constants.py +++ b/claudecode/constants.py @@ -5,7 +5,7 @@ import os # API Configuration -DEFAULT_CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL') or 'claude-opus-4-5-20251101' +DEFAULT_CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL') or 'claude-opus-4-6' DEFAULT_TIMEOUT_SECONDS = 180 # 3 minutes DEFAULT_MAX_RETRIES = 3 RATE_LIMIT_BACKOFF_MAX = 30 # Maximum backoff time for rate limits @@ -20,4 +20,3 @@ # Subprocess Configuration SUBPROCESS_TIMEOUT = 1200 # 20 minutes for Claude Code execution - From 373748825586d39e8b4237bf20dc17eeb4916c6f Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Tue, 10 Feb 2026 12:39:53 +0100 Subject: [PATCH 05/12] Dismiss stale action reviews reliably with review marker --- scripts/comment-pr-findings.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/comment-pr-findings.js b/scripts/comment-pr-findings.js index 270f98b..0f212df 100755 --- a/scripts/comment-pr-findings.js +++ b/scripts/comment-pr-findings.js @@ -6,6 +6,7 @@ const fs = require('fs'); const { spawnSync } = require('child_process'); +const REVIEW_MARKER = ''; // PR Summary marker for identifying our summary sections const PR_SUMMARY_MARKER = '📋 **PR Summary:**'; @@ -102,6 +103,7 @@ function addReactionsToReview(reviewId) { // Check if a review was posted by this action function isOwnReview(review) { if (!review.body) return false; + if (review.body.includes(REVIEW_MARKER)) return true; // Check for our review summary patterns const ownPatterns = [ @@ -135,8 +137,8 @@ function findExistingReview() { for (const review of reviews) { const isDismissible = review.state === 'APPROVED' || review.state === 'CHANGES_REQUESTED'; - const isBot = review.user && review.user.type === 'Bot'; const isOwn = isOwnReview(review); + const isBot = review.user && review.user.type === 'Bot'; if (isBot && isDismissible && isOwn) { return review; @@ -293,7 +295,7 @@ async function run() { const highSeverityCount = analysisSummary.high_severity || 0; const reviewEvent = highSeverityCount > 0 ? 
'REQUEST_CHANGES' : 'APPROVE'; - const reviewBody = buildReviewSummary(newFindings, prSummary, analysisSummary); + const reviewBody = `${buildReviewSummary(newFindings, prSummary, analysisSummary)}\n\n${REVIEW_MARKER}`; // Prepare review comments const reviewComments = []; From 1ac0c46f7d52d6851765064f6c2d6c0207eb2fde Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Tue, 10 Feb 2026 12:42:24 +0100 Subject: [PATCH 06/12] Use review marker only for stale review dismissal --- scripts/comment-pr-findings.bun.test.js | 2 +- scripts/comment-pr-findings.js | 22 +--------------------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/scripts/comment-pr-findings.bun.test.js b/scripts/comment-pr-findings.bun.test.js index e5dae90..987adb1 100644 --- a/scripts/comment-pr-findings.bun.test.js +++ b/scripts/comment-pr-findings.bun.test.js @@ -1026,7 +1026,7 @@ describe('comment-pr-findings.js', () => { id: 201, state: 'APPROVED', user: { type: 'Bot' }, - body: '📋 **PR Summary:**\nThis PR adds auth.\n\n---\n\nNo issues found. Changes look good.' + body: '\nNo issues found. Changes look good.' } ]; diff --git a/scripts/comment-pr-findings.js b/scripts/comment-pr-findings.js index 0f212df..0dbd4ee 100755 --- a/scripts/comment-pr-findings.js +++ b/scripts/comment-pr-findings.js @@ -103,27 +103,7 @@ function addReactionsToReview(reviewId) { // Check if a review was posted by this action function isOwnReview(review) { if (!review.body) return false; - if (review.body.includes(REVIEW_MARKER)) return true; - - // Check for our review summary patterns - const ownPatterns = [ - PR_SUMMARY_MARKER, - 'No issues found. Changes look good.', - /^Found \d+ .+ issues?\./, - 'Please address the high-severity issues before merging.', - 'Consider addressing the suggestions in the comments.', - 'Minor suggestions noted in comments.' 
- ]; - - for (const pattern of ownPatterns) { - if (pattern instanceof RegExp) { - if (pattern.test(review.body)) return true; - } else { - if (review.body.includes(pattern)) return true; - } - } - - return false; + return review.body.includes(REVIEW_MARKER); } // Find an existing review posted by this action From 80f1dcf6e46ba511d21258431a87b2f04dfb7f9e Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Fri, 13 Feb 2026 09:55:33 +0100 Subject: [PATCH 07/12] Address remaining PR #21 review feedback --- claudecode/github_action_audit.py | 6 ++- claudecode/prompts.py | 5 +++ claudecode/review_orchestrator.py | 18 ++++++--- claudecode/test_findings_merge.py | 32 +++++++++++++++ claudecode/test_prompts.py | 10 ++++- claudecode/test_review_orchestrator.py | 55 ++++++++++++++++++++++++++ 6 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 claudecode/test_findings_merge.py create mode 100644 claudecode/test_review_orchestrator.py diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index 7501e9a..f98c394 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -234,6 +234,10 @@ def _is_excluded(self, filepath: str) -> bool: return True return False + + def is_excluded(self, filepath: str) -> bool: + """Public wrapper for exclusion checks.""" + return self._is_excluded(filepath) def _filter_generated_files(self, diff_text: str) -> str: """Filter out generated files and excluded directories from diff content.""" @@ -329,7 +333,7 @@ def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) - if (isinstance(parsed_result, dict) and parsed_result.get('type') == 'result' and parsed_result.get('subtype') == 'error_during_execution' and - attempt == 0): + attempt < NUM_RETRIES - 1): continue if isinstance(parsed_result, dict) and 'result' in parsed_result and isinstance(parsed_result['result'], str): diff --git a/claudecode/prompts.py b/claudecode/prompts.py index 87dccf8..2fac032 100644 --- a/claudecode/prompts.py +++ b/claudecode/prompts.py @@ -19,6 +19,11 @@ def _build_hybrid_diff_section(pr_diff: str, max_lines: int) -> str: """Build a bounded inline diff section while requiring tool-based context reads.""" if not pr_diff: return "\nNo inline diff available. Use repository tools to inspect changed files.\n" + if max_lines == 0: + return ( + "\nInline diff intentionally omitted (max-diff-lines=0). 
" + "Use repository tools to inspect changed files and context.\n" + ) lines = pr_diff.splitlines() if max_lines > 0 and len(lines) > max_lines: diff --git a/claudecode/review_orchestrator.py b/claudecode/review_orchestrator.py index 19a6dc3..4256b69 100644 --- a/claudecode/review_orchestrator.py +++ b/claudecode/review_orchestrator.py @@ -64,9 +64,10 @@ def __init__( def _run_phase(self, repo_dir: Path, prompt: str, model: str, phase_name: str) -> Tuple[bool, Dict[str, Any], str]: raw_result = None - if hasattr(self.claude_runner, "run_prompt"): + supports_run_prompt = callable(getattr(type(self.claude_runner), "run_prompt", None)) + if supports_run_prompt: raw_result = self.claude_runner.run_prompt(repo_dir, prompt, model=model) - if not (isinstance(raw_result, tuple) and len(raw_result) == 3) and hasattr(self.claude_runner, "run_code_review"): + elif hasattr(self.claude_runner, "run_code_review"): try: raw_result = self.claude_runner.run_code_review(repo_dir, prompt, model=model) except TypeError: @@ -87,6 +88,13 @@ def _run_phase(self, repo_dir: Path, prompt: str, model: str, phase_name: str) - return False, {}, f"Failed to parse {phase_name} output" return True, parsed, "" + def _is_excluded(self, filepath: str) -> bool: + checker = getattr(self.github_client, "is_excluded", None) + if callable(checker): + return bool(checker(filepath)) + # Fallback for current client/tests that still expose private method. + return bool(self.github_client._is_excluded(filepath)) + def _collect_phase_findings(self, phase_result: Dict[str, Any], source_agent: str) -> List[Dict[str, Any]]: findings: List[Dict[str, Any]] = [] for finding in phase_result.get("findings", []): @@ -125,7 +133,7 @@ def run( original_count = len([f for f in triage_result.get("findings", []) if isinstance(f, dict)]) legacy_findings = [] for finding in triage_result.get("findings", []): - if isinstance(finding, dict) and not self.github_client._is_excluded(finding.get("file", "")): + if isinstance(finding, dict) and not self._is_excluded(finding.get("file", "")): legacy_findings.append(self._ensure_review_type(finding)) pr_context = { "repo_name": pr_data.get("head", {}).get("repo", {}).get("full_name", "unknown"), @@ -255,7 +263,7 @@ def run( "description": pr_data.get("body", ""), } kept_findings = validated - original_count = len(validated) + original_count = len(all_candidates) filter_response = self.findings_filter.filter_findings(validated, pr_context) if isinstance(filter_response, tuple) and len(filter_response) == 3: filter_success, filter_results, _ = filter_response @@ -266,7 +274,7 @@ def run( for finding in kept_findings: if not isinstance(finding, dict): continue - if self.github_client._is_excluded(finding.get("file", "")): + if self._is_excluded(finding.get("file", "")): continue final_findings.append(self._ensure_review_type(finding)) diff --git a/claudecode/test_findings_merge.py b/claudecode/test_findings_merge.py new file mode 100644 index 0000000..9129e7a --- /dev/null +++ b/claudecode/test_findings_merge.py @@ -0,0 +1,32 @@ +"""Unit tests for findings_merge utilities.""" + +from claudecode.findings_merge import merge_findings + + +def test_merge_findings_empty_input(): + assert merge_findings([]) == [] + + +def test_merge_findings_keeps_higher_severity_then_confidence(): + findings = [ + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "MEDIUM", "confidence": 0.9}, + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "HIGH", "confidence": 
0.8}, + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "HIGH", "confidence": 0.95}, + ] + + merged = merge_findings(findings) + assert len(merged) == 1 + assert merged[0]["severity"] == "HIGH" + assert merged[0]["confidence"] == 0.95 + + +def test_merge_findings_separate_keys_are_preserved(): + findings = [ + {"file": "a.py", "line": 1, "category": "correctness", "title": "One", "severity": "LOW"}, + {"file": "a.py", "line": 2, "category": "correctness", "title": "Two", "severity": "LOW"}, + {"file": "b.py", "line": 1, "category": "security", "title": "One", "severity": "MEDIUM"}, + ] + + merged = merge_findings(findings) + assert len(merged) == 3 + diff --git a/claudecode/test_prompts.py b/claudecode/test_prompts.py index dbb9d4f..2e9ed52 100644 --- a/claudecode/test_prompts.py +++ b/claudecode/test_prompts.py @@ -192,4 +192,12 @@ def test_get_unified_review_prompt_includes_summary_guidelines(self): assert "PR SUMMARY GUIDELINES:" in prompt assert "2-4 sentences" in prompt - assert "~10 words" in prompt \ No newline at end of file + assert "~10 words" in prompt + + def test_build_hybrid_diff_section_max_lines_zero_omits_inline_diff(self): + from claudecode.prompts import _build_hybrid_diff_section + + section = _build_hybrid_diff_section("diff --git a/a.py b/a.py\n+print('x')", 0) + + assert "intentionally omitted" in section + assert "```diff" not in section diff --git a/claudecode/test_review_orchestrator.py b/claudecode/test_review_orchestrator.py new file mode 100644 index 0000000..e28bbc5 --- /dev/null +++ b/claudecode/test_review_orchestrator.py @@ -0,0 +1,55 @@ +"""Unit tests for review_orchestrator.""" + +from pathlib import Path +from unittest.mock import Mock + +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator + + +def test_review_model_config_from_env_with_fallbacks(): + env = { + "CLAUDE_MODEL": "global-model", + "MODEL_TRIAGE": "triage-model", + "MODEL_SECURITY": "security-model", + } + cfg = ReviewModelConfig.from_env(env, "default-model") + assert cfg.triage == "triage-model" + assert cfg.compliance == "global-model" + assert cfg.quality == "global-model" + assert cfg.security == "security-model" + assert cfg.validation == "global-model" + + +def test_orchestrator_legacy_single_pass_path(): + runner = Mock() + runner.run_code_review.return_value = ( + True, + "", + { + "findings": [ + {"file": "a.py", "line": 3, "severity": "HIGH", "category": "security", "title": "Issue"} + ], + "analysis_summary": {"files_reviewed": 1}, + }, + ) + findings_filter = Mock() + findings_filter.filter_findings.return_value = ( + True, + {"filtered_findings": runner.run_code_review.return_value[2]["findings"]}, + Mock(), + ) + github_client = Mock() + github_client._is_excluded.return_value = False + cfg = ReviewModelConfig("t", "c", "q", "s", "v") + + orchestrator = ReviewOrchestrator(runner, findings_filter, github_client, cfg, 100) + ok, result, err = orchestrator.run( + repo_dir=Path("."), + pr_data={"number": 1, "changed_files": 1, "head": {"repo": {"full_name": "x/y"}}}, + pr_diff="diff --git a/a.py b/a.py", + ) + + assert ok is True + assert err == "" + assert len(result["findings"]) == 1 + assert result["findings"][0]["review_type"] == "security" From df71d9f8a57fb616d296415aa958a308a17033ea Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Fri, 13 Feb 2026 10:26:20 +0100 Subject: [PATCH 08/12] Restore mainline compatibility after rebase --- claudecode/github_action_audit.py | 86 ++++++++++++++- 
claudecode/prompts.py | 137 ++++++++++++++++-------- scripts/comment-pr-findings.bun.test.js | 4 +- 3 files changed, 182 insertions(+), 45 deletions(-) diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index f98c394..8decdd5 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -17,6 +17,7 @@ # Import existing components we can reuse from claudecode.findings_filter import FindingsFilter from claudecode.json_parser import parse_json_with_fallbacks +from claudecode.format_pr_comments import format_pr_comments_for_prompt, is_bot_comment from claudecode.prompts import get_unified_review_prompt # Backward-compatible import for tests/extensions. from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator from claudecode.constants import ( @@ -27,6 +28,7 @@ SUBPROCESS_TIMEOUT ) from claudecode.logger import get_logger +from claudecode.review_schema import REVIEW_OUTPUT_SCHEMA logger = get_logger(__name__) @@ -201,6 +203,56 @@ def get_pr_diff(self, repo_name: str, pr_number: int) -> str: response.raise_for_status() return self._filter_generated_files(response.text) + + def get_pr_comments(self, repo_name: str, pr_number: int) -> List[Dict[str, Any]]: + """Get all review comments for a PR with pagination.""" + all_comments = [] + page = 1 + per_page = 100 + + while True: + url = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}/comments" + params = {'per_page': per_page, 'page': page} + + try: + response = requests.get(url, headers=self.headers, params=params) + response.raise_for_status() + comments = response.json() + + if not comments: + break + + all_comments.extend(comments) + if len(comments) < per_page: + break + page += 1 + except requests.RequestException as e: + logger.warning(f"Failed to fetch comments page {page}: {e}") + break + + return all_comments + + def get_comment_reactions(self, repo_name: str, comment_id: int) -> Dict[str, int]: + """Get reactions for a specific comment, excluding bot reactions.""" + url = f"https://api.github.com/repos/{repo_name}/pulls/comments/{comment_id}/reactions" + + try: + response = requests.get(url, headers=self.headers) + response.raise_for_status() + reactions = response.json() + + counts: Dict[str, int] = {} + for reaction in reactions: + user = reaction.get('user', {}) + if user.get('type') == 'Bot': + continue + content = reaction.get('content', '') + if content: + counts[content] = counts.get(content, 0) + 1 + return counts + except requests.RequestException as e: + logger.debug(f"Failed to fetch reactions for comment {comment_id}: {e}") + return {} def _is_excluded(self, filepath: str) -> bool: """Check if a file should be excluded based on directory or file patterns.""" @@ -298,7 +350,8 @@ def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) - 'claude', '--output-format', 'json', '--model', model_name, - '--disallowed-tools', 'Bash(ps:*)' + '--disallowed-tools', 'Bash(ps:*)', + '--json-schema', json.dumps(REVIEW_OUTPUT_SCHEMA), ] NUM_RETRIES = 3 @@ -370,11 +423,14 @@ def _extract_review_findings(self, claude_output: Any) -> Dict[str, Any]: # Try to extract JSON from the result text success, result_json = parse_json_with_fallbacks(result_text, "Claude result text") if success and result_json and 'findings' in result_json: + if 'pr_summary' not in result_json: + result_json['pr_summary'] = {} return result_json # Return empty structure if no findings found return { 'findings': [], + 'pr_summary': {}, 'analysis_summary': 
{ 'files_reviewed': 0, 'high_severity': 0, @@ -676,6 +732,34 @@ def main(): print(json.dumps({'error': f'Failed to fetch PR data: {str(e)}'})) sys.exit(EXIT_GENERAL_ERROR) + # Backward-compatible context collection for review thread history. + review_context = None + try: + pr_comments = github_client.get_pr_comments(repo_name, pr_number) + if pr_comments: + bot_comment_threads = [] + for comment in pr_comments: + if is_bot_comment(comment): + reactions = github_client.get_comment_reactions(repo_name, comment['id']) + replies = [ + c for c in pr_comments + if c.get('in_reply_to_id') == comment['id'] + ] + replies.sort(key=lambda c: c.get('created_at', '')) + bot_comment_threads.append({ + 'bot_comment': comment, + 'replies': replies, + 'reactions': reactions, + }) + bot_comment_threads.sort(key=lambda t: t['bot_comment'].get('created_at', '')) + if bot_comment_threads: + review_context = format_pr_comments_for_prompt(bot_comment_threads) + if review_context: + logger.info(f"Fetched previous review context ({len(review_context)} chars)") + except Exception as e: + logger.warning(f"Failed to fetch review context (continuing without it): {e}") + review_context = None + max_diff_lines = get_max_diff_lines() diff_line_count = len(pr_diff.splitlines()) if max_diff_lines and diff_line_count > max_diff_lines: diff --git a/claudecode/prompts.py b/claudecode/prompts.py index 2fac032..9ae3f2e 100644 --- a/claudecode/prompts.py +++ b/claudecode/prompts.py @@ -40,6 +40,31 @@ def _build_hybrid_diff_section(pr_diff: str, max_lines: int) -> str: "Use this as a starting point only. You MUST validate findings with repository tool reads.\n" ) +def _build_diff_section(pr_diff: Optional[str], include_diff: bool) -> str: + """Build unified-prompt diff section for backward compatibility.""" + if pr_diff and include_diff: + return f""" + +PR DIFF CONTENT: +``` +{pr_diff} +``` + +Review the complete diff above. This contains all code changes in the PR. +""" + + return """ + +IMPORTANT - FILE READING INSTRUCTIONS: +You have access to the repository files. For each file listed above, use the Read tool to examine the changes. +Focus on the files that are most likely to contain issues based on the PR context. + +To review effectively: +1. Read each modified file to understand the current code +2. Look at surrounding code context when needed to understand the changes +3. Check related files if you need to understand dependencies or usage patterns +""" + def _base_context_block(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: """Shared context block used across prompts.""" @@ -281,48 +306,76 @@ def get_unified_review_prompt( include_diff: bool = True, custom_review_instructions: Optional[str] = None, custom_security_instructions: Optional[str] = None, + review_context: Optional[str] = None, ) -> str: - """Backward-compatible unified prompt used by tests and direct calls. - - The unified prompt now enforces hybrid behavior: even when diff is included, - repository context reads are still mandatory. 
- """ - diff_text = pr_diff if include_diff else "" - max_lines = len(diff_text.splitlines()) if diff_text else 0 - combined = build_quality_prompt( - pr_data, - diff_text, - max_lines, - discovered_context={}, - custom_review_instructions=custom_review_instructions, - ) - security = build_security_prompt( - pr_data, - diff_text, - max_lines, - discovered_context={}, - custom_security_instructions=custom_security_instructions, - ) + """Backward-compatible unified prompt used by tests and direct calls.""" + files_changed = _format_files_changed(pr_data) + diff_section = _build_diff_section(pr_diff, include_diff) + custom_review_section = f"\n{custom_review_instructions}\n" if custom_review_instructions else "" + custom_security_section = f"\n{custom_security_instructions}\n" if custom_security_instructions else "" - file_reading_block = ( - "\nIMPORTANT - FILE READING INSTRUCTIONS:\n" - "You MUST read changed files and nearby context with repository tools before final findings.\n" - ) - if include_diff and diff_text: - diff_anchor = f"\nPR DIFF CONTENT:\n```diff\n{diff_text}\n```\n" - else: - diff_anchor = "\n" + pr_description = (pr_data.get('body', '') or '').strip() + pr_description_section = "" + if pr_description: + if len(pr_description) > 2000: + pr_description = pr_description[:2000] + "... (truncated)" + pr_description_section = f"\nPR Description:\n{pr_description}\n" - return ( - f"You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data.get('number', 'unknown')}.\n" - "CONTEXT:\n" - f"- Title: {pr_data.get('title', 'unknown')}\n" - "OBJECTIVE:\n" - "Perform a high-signal code quality and security review with mandatory repository context validation.\n" - + diff_anchor - + file_reading_block - + "\nREQUIRED OUTPUT FORMAT:\nJSON only.\n\n" - + combined - + "\n\n" - + security - ) + review_context_section = review_context or "" + + return f""" +You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data.get('number', 'unknown')}: "{pr_data.get('title', 'unknown')}" + +CONTEXT: +- Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} +- Author: {pr_data.get('user', 'unknown')} +- Files changed: {pr_data.get('changed_files', 0)} +- Lines added: {pr_data.get('additions', 0)} +- Lines deleted: {pr_data.get('deletions', 0)} +{pr_description_section} +Files modified: +{files_changed}{diff_section}{review_context_section} + +OBJECTIVE: +Perform a focused, high-signal code review to identify HIGH-CONFIDENCE issues introduced by this PR. 
+ +CODE QUALITY CATEGORIES: +- correctness, reliability, performance, maintainability, testing +{custom_review_section} +SECURITY CATEGORIES: +- input validation, authn/authz, crypto/secrets, code execution, data exposure +{custom_security_section} + +REQUIRED OUTPUT FORMAT: + +{{ + "pr_summary": {{ + "overview": "2-4 sentence summary of what this PR changes and why it matters", + "file_changes": [ + {{ + "label": "src/auth.py", + "files": ["src/auth.py"], + "changes": "Brief description of changes (~10 words)" + }} + ] + }}, + "findings": [ + {{ + "file": "path/to/file.py", + "line": 42, + "severity": "HIGH|MEDIUM|LOW", + "category": "correctness|reliability|performance|maintainability|testing|security", + "title": "Short summary of the issue", + "description": "What is wrong and where it happens", + "impact": "Concrete impact or failure mode", + "recommendation": "Actionable fix or mitigation", + "confidence": 0.95 + }} + ] +}} + +PR SUMMARY GUIDELINES: +- overview: 2-4 sentences describing WHAT changed and WHY (purpose/goal) +- file_changes: One entry per file or group of related files +- changes: Brief description (~10 words), focus on purpose not implementation +""" diff --git a/scripts/comment-pr-findings.bun.test.js b/scripts/comment-pr-findings.bun.test.js index 987adb1..d28878c 100644 --- a/scripts/comment-pr-findings.bun.test.js +++ b/scripts/comment-pr-findings.bun.test.js @@ -1076,7 +1076,7 @@ describe('comment-pr-findings.js', () => { test('should update existing review in place when state is unchanged and no inline comments', async () => { // Existing APPROVED review, new findings also result in APPROVED (no HIGH severity) const mockReviews = [ - { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: 'No issues found. Changes look good.' } + { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: '\nNo issues found. Changes look good.' } ]; readFileSyncSpy.mockImplementation((path) => { @@ -1129,7 +1129,7 @@ describe('comment-pr-findings.js', () => { test('should dismiss and create new review when state changes', async () => { // Existing APPROVED review, but new findings have HIGH severity = CHANGES_REQUESTED const mockReviews = [ - { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: 'No issues found. Changes look good.' } + { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: '\nNo issues found. Changes look good.' 
} ]; const mockFindings = [{ From df89a9379323961b55d541500266f199c762b211 Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Fri, 13 Feb 2026 12:57:37 +0100 Subject: [PATCH 09/12] Expand gitignore and ignore OS/editor artifacts --- .gitignore | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8b8dd8e..dd3b326 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,22 @@ +# OS-generated files +.DS_Store +Thumbs.db + # Cache directories .cache/ +.pytest_cache/ # Python __pycache__/ *.py[cod] *$py.class *.pyc +.python-version +.mypy_cache/ +.ruff_cache/ +.coverage +.coverage.* +htmlcov/ # Output files *.csv @@ -21,4 +32,17 @@ env/ claudecode/claudecode-prompt.txt eval_results/ -.env \ No newline at end of file +.env +.env.* + +# Editor / IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Node / Bun +node_modules/ + +# Logs +*.log From 66fdece73c5bcbb5fe631cb007c3c9f1f1f88738 Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Fri, 13 Feb 2026 13:09:37 +0100 Subject: [PATCH 10/12] Remove remaining single-pass compatibility paths --- README.md | 2 +- claudecode/review_orchestrator.py | 55 +++---------------- claudecode/test_main_function.py | 71 ++++++++++++------------- claudecode/test_review_orchestrator.py | 25 +++------ claudecode/test_workflow_integration.py | 71 +++++++++++-------------- 5 files changed, 81 insertions(+), 143 deletions(-) diff --git a/README.md b/README.md index 2c36f21..b55c320 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Nutrient Code Reviewer -An AI-powered code review GitHub Action using Claude to analyze code changes. Uses a unified multi-agent approach for both code quality (correctness, reliability, performance, maintainability, testing) and security in a single pass. This action provides intelligent, context-aware review for pull requests using Anthropic's Claude Code tool for deep semantic analysis. +An AI-powered code review GitHub Action using Claude to analyze code changes. Uses a unified multi-agent, multi-phase approach for both code quality (correctness, reliability, performance, maintainability, testing) and security. This action provides intelligent, context-aware review for pull requests using Anthropic's Claude Code tool for deep semantic analysis. Based on the original work from [anthropics/claude-code-security-review](https://github.com/anthropics/claude-code-security-review). diff --git a/claudecode/review_orchestrator.py b/claudecode/review_orchestrator.py index 4256b69..8f70bac 100644 --- a/claudecode/review_orchestrator.py +++ b/claudecode/review_orchestrator.py @@ -63,16 +63,11 @@ def __init__( self.max_diff_lines = max(0, max_diff_lines) def _run_phase(self, repo_dir: Path, prompt: str, model: str, phase_name: str) -> Tuple[bool, Dict[str, Any], str]: - raw_result = None - supports_run_prompt = callable(getattr(type(self.claude_runner), "run_prompt", None)) - if supports_run_prompt: - raw_result = self.claude_runner.run_prompt(repo_dir, prompt, model=model) - elif hasattr(self.claude_runner, "run_code_review"): - try: - raw_result = self.claude_runner.run_code_review(repo_dir, prompt, model=model) - except TypeError: - # Backward compatibility for legacy mocks/runners that don't accept model parameter. 
- raw_result = self.claude_runner.run_code_review(repo_dir, prompt) + run_prompt = getattr(self.claude_runner, "run_prompt", None) + if not callable(run_prompt): + return False, {}, f"Runner missing run_prompt for {phase_name}" + + raw_result = run_prompt(repo_dir, prompt, model=model) if not (isinstance(raw_result, tuple) and len(raw_result) == 3): return False, {}, f"Invalid runner response for {phase_name}" @@ -128,44 +123,8 @@ def run( if not ok: return False, {}, f"Triage phase failed: {err}" - # Backward-compatible fast path for legacy single-pass outputs. - if isinstance(triage_result, dict) and "findings" in triage_result: - original_count = len([f for f in triage_result.get("findings", []) if isinstance(f, dict)]) - legacy_findings = [] - for finding in triage_result.get("findings", []): - if isinstance(finding, dict) and not self._is_excluded(finding.get("file", "")): - legacy_findings.append(self._ensure_review_type(finding)) - pr_context = { - "repo_name": pr_data.get("head", {}).get("repo", {}).get("full_name", "unknown"), - "pr_number": pr_data.get("number"), - "title": pr_data.get("title", ""), - "description": pr_data.get("body", ""), - } - filter_response = self.findings_filter.filter_findings(legacy_findings, pr_context) - if isinstance(filter_response, tuple) and len(filter_response) == 3: - filter_success, filter_results, _ = filter_response - if filter_success and isinstance(filter_results, dict): - legacy_findings = filter_results.get("filtered_findings", legacy_findings) - legacy_findings = [self._ensure_review_type(f) for f in legacy_findings if isinstance(f, dict)] - high = len([f for f in legacy_findings if str(f.get("severity", "")).upper() == "HIGH"]) - medium = len([f for f in legacy_findings if str(f.get("severity", "")).upper() == "MEDIUM"]) - low = len([f for f in legacy_findings if str(f.get("severity", "")).upper() == "LOW"]) - return True, { - "findings": legacy_findings, - "analysis_summary": { - "files_reviewed": triage_result.get("analysis_summary", {}).get("files_reviewed", pr_data.get("changed_files", 0)), - "high_severity": high, - "medium_severity": medium, - "low_severity": low, - "review_completed": True, - }, - "filtering_summary": { - "total_original_findings": original_count, - "excluded_findings": max(0, original_count - len(legacy_findings)), - "kept_findings": len(legacy_findings), - }, - "triage": {"skip_review": False, "reason": "legacy_single_pass", "risk_level": "medium"}, - }, "" + if not isinstance(triage_result, dict) or "skip_review" not in triage_result: + return False, {}, "Triage phase returned invalid schema" if triage_result.get("skip_review") is True: logger.info("Skipping review based on triage decision: %s", triage_result.get("reason", "")) diff --git a/claudecode/test_main_function.py b/claudecode/test_main_function.py index 0b2bdae..f5239d2 100644 --- a/claudecode/test_main_function.py +++ b/claudecode/test_main_function.py @@ -12,6 +12,28 @@ from claudecode.github_action_audit import main +def _mock_multiphase_success(mock_runner, findings=None): + """Configure runner mock for strict multi-phase orchestration.""" + findings = findings or [] + mock_runner.run_prompt.side_effect = [ + (True, "", {"skip_review": False, "reason": "", "risk_level": "medium"}), # triage + (True, "", {"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), # context + (True, "", {"findings": findings}), # compliance + (True, "", {"findings": []}), # quality + (True, "", {"findings": []}), # security + ( + True, + "", 
+ { + "validated_findings": [ + {"finding_index": idx, "keep": True, "confidence": 0.95, "reason": "valid"} + for idx in range(len(findings)) + ] + }, + ), # validation + ] + + class TestMainFunction: """Test main function execution flow.""" @@ -232,20 +254,7 @@ def test_main_successful_audit_no_findings(self, mock_client_class, mock_runner_ mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - # Unified review - single call - mock_runner.run_code_review.return_value = ( - True, - "", - { - 'findings': [], - 'analysis_summary': { - 'files_reviewed': 5, - 'high_severity': 0, - 'medium_severity': 0, - 'low_severity': 0 - } - } - ) + _mock_multiphase_success(mock_runner, findings=[]) mock_runner_class.return_value = mock_runner mock_filter = Mock() @@ -299,6 +308,7 @@ def test_main_successful_audit_with_findings(self, mock_client_class, mock_runne } mock_client.get_pr_diff.return_value = "diff content" mock_client._is_excluded.return_value = False # Don't exclude any files in tests + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [ @@ -320,20 +330,7 @@ def test_main_successful_audit_with_findings(self, mock_client_class, mock_runne mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - # Unified review - single call with all findings - mock_runner.run_code_review.return_value = ( - True, - "", - { - 'findings': findings, - 'analysis_summary': { - 'files_reviewed': 2, - 'high_severity': 1, - 'medium_severity': 1, - 'low_severity': 0 - } - } - ) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner # Filter keeps only high severity @@ -391,13 +388,14 @@ def test_main_with_full_filter(self, mock_client_class, mock_runner_class, } mock_client.get_pr_diff.return_value = "diff content" mock_client._is_excluded.return_value = False # Don't exclude any files in tests + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [{'file': 'test.py', 'line': 10, 'severity': 'HIGH', 'description': 'Issue'}] mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner mock_prompt_func.return_value = "prompt" @@ -450,6 +448,7 @@ def test_main_filter_failure_keeps_all_findings(self, mock_client_class, mock_ru mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} mock_client.get_pr_diff.return_value = "diff" mock_client._is_excluded.return_value = False # Don't exclude any files in tests + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [ @@ -459,7 +458,7 @@ def test_main_filter_failure_keeps_all_findings(self, mock_client_class, mock_ru mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner mock_prompt_func.return_value = "prompt" @@ -539,11 +538,7 @@ def test_audit_failure(self, mock_client_class, mock_runner_class, mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = ( - False, - "Claude execution failed", - {} - ) + 
mock_runner.run_prompt.return_value = (False, "Claude execution failed", {}) mock_runner_class.return_value = mock_runner mock_filter_class.return_value = Mock() @@ -582,6 +577,7 @@ def test_findings_have_correct_review_type(self, mock_client_class, mock_runner_ mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} mock_client.get_pr_diff.return_value = "diff" mock_client._is_excluded.return_value = False + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client # Mixed findings - security and non-security @@ -592,7 +588,7 @@ def test_findings_have_correct_review_type(self, mock_client_class, mock_runner_ mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner # Filter passes through all findings @@ -644,6 +640,7 @@ def test_review_type_not_overwritten_if_already_set(self, mock_client_class, moc mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} mock_client.get_pr_diff.return_value = "diff" mock_client._is_excluded.return_value = False + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client # Finding already has review_type set (e.g., from a custom analyzer) @@ -654,7 +651,7 @@ def test_review_type_not_overwritten_if_already_set(self, mock_client_class, moc mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner mock_filter = Mock() diff --git a/claudecode/test_review_orchestrator.py b/claudecode/test_review_orchestrator.py index e28bbc5..8dec762 100644 --- a/claudecode/test_review_orchestrator.py +++ b/claudecode/test_review_orchestrator.py @@ -20,26 +20,16 @@ def test_review_model_config_from_env_with_fallbacks(): assert cfg.validation == "global-model" -def test_orchestrator_legacy_single_pass_path(): +def test_orchestrator_invalid_triage_schema_fails(): runner = Mock() - runner.run_code_review.return_value = ( + runner.run_prompt.return_value = ( True, "", - { - "findings": [ - {"file": "a.py", "line": 3, "severity": "HIGH", "category": "security", "title": "Issue"} - ], - "analysis_summary": {"files_reviewed": 1}, - }, + {"findings": [{"file": "a.py", "line": 3, "severity": "HIGH", "category": "security", "title": "Issue"}]}, ) findings_filter = Mock() - findings_filter.filter_findings.return_value = ( - True, - {"filtered_findings": runner.run_code_review.return_value[2]["findings"]}, - Mock(), - ) github_client = Mock() - github_client._is_excluded.return_value = False + github_client.is_excluded.return_value = False cfg = ReviewModelConfig("t", "c", "q", "s", "v") orchestrator = ReviewOrchestrator(runner, findings_filter, github_client, cfg, 100) @@ -49,7 +39,6 @@ def test_orchestrator_legacy_single_pass_path(): pr_diff="diff --git a/a.py b/a.py", ) - assert ok is True - assert err == "" - assert len(result["findings"]) == 1 - assert result["findings"][0]["review_type"] == "security" + assert ok is False + assert result == {} + assert "invalid schema" in err.lower() diff --git a/claudecode/test_workflow_integration.py b/claudecode/test_workflow_integration.py index e5410d6..325d062 100644 --- a/claudecode/test_workflow_integration.py 
+++ b/claudecode/test_workflow_integration.py @@ -13,6 +13,31 @@ from claudecode.github_action_audit import main +def _mock_claude_result(payload): + """Build a mocked subprocess result containing JSON payload.""" + return Mock(returncode=0, stdout=json.dumps(payload), stderr='') + + +def _multiphase_run_side_effect(findings): + """Create subprocess.run side effects for strict multi-phase flow.""" + return [ + Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), # claude --version + _mock_claude_result({"skip_review": False, "reason": "", "risk_level": "medium"}), # triage + _mock_claude_result({"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), # context + _mock_claude_result({"findings": findings}), # compliance + _mock_claude_result({"findings": []}), # quality + _mock_claude_result({"findings": []}), # security + _mock_claude_result( + { + "validated_findings": [ + {"finding_index": idx, "keep": True, "confidence": 0.95, "reason": "valid"} + for idx in range(len(findings)) + ] + } + ), # validation + ] + + class TestFullWorkflowIntegration: """Test complete workflow scenarios.""" @@ -242,23 +267,7 @@ def test_full_workflow_with_real_pr_structure(self, mock_get, mock_run): ] } - # Mock Claude CLI - version_result = Mock() - version_result.returncode = 0 - version_result.stdout = 'claude version 1.0.0' - version_result.stderr = '' - - audit_result = Mock() - audit_result.returncode = 0 - # Claude wraps the result in a specific format - claude_wrapped_response = { - 'result': json.dumps(claude_response) - } - audit_result.stdout = json.dumps(claude_wrapped_response) - audit_result.stderr = '' - - # Provide results for unified review (single pass) - mock_run.side_effect = [version_result, audit_result] + mock_run.side_effect = _multiphase_run_side_effect(claude_response["findings"]) # Run the workflow with tempfile.TemporaryDirectory() as tmpdir: @@ -281,10 +290,10 @@ def test_full_workflow_with_real_pr_structure(self, mock_get, mock_run): # Verify API calls # 5 calls: PR data, files, diff, comments, reactions for bot comment assert mock_get.call_count == 5 - assert mock_run.call_count == 2 # 1 version check + 1 unified review + assert mock_run.call_count == 7 # 1 version check + 6 phase calls # Verify the audit was run with proper prompt - audit_call = mock_run.call_args_list[1] + audit_call = mock_run.call_args_list[1] # triage prompt prompt = audit_call[1]['input'] assert 'Add new authentication feature' in prompt # Title assert 'src/auth/oauth2.py' in prompt # File name @@ -337,11 +346,7 @@ def test_workflow_with_llm_filtering(self, mock_get, mock_run): } ] - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout=json.dumps({"findings": claude_findings}), stderr=''), - Mock(returncode=0, stdout=json.dumps({"findings": []}), stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect(claude_findings) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -415,11 +420,7 @@ def test_workflow_with_no_review_issues(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] # Claude finds no issues - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": [], "analysis_summary": {"review_completed": true}}', stderr=''), - Mock(returncode=0, stdout='{"findings": [], "analysis_summary": {"review_completed": true}}', stderr='') - ] + mock_run.side_effect = 
_multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -506,11 +507,7 @@ def test_workflow_with_massive_pr(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] # Claude handles it gracefully - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -586,11 +583,7 @@ def test_workflow_with_binary_files(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) From 9aaa3eda12b916ad07c4e9ca22359d6ff328dcc6 Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Fri, 13 Feb 2026 14:02:11 +0100 Subject: [PATCH 11/12] Use JSON schema only for legacy unified review path --- claudecode/github_action_audit.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index 8decdd5..2ba78d4 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -335,7 +335,13 @@ def __init__(self, timeout_minutes: Optional[int] = None): else: self.timeout_seconds = SUBPROCESS_TIMEOUT - def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) -> Tuple[bool, str, Any]: + def run_prompt( + self, + repo_dir: Path, + prompt: str, + model: Optional[str] = None, + json_schema: Optional[Dict[str, Any]] = None, + ) -> Tuple[bool, str, Any]: """Run a single Claude prompt and return parsed JSON payload or text.""" if not repo_dir.exists(): return False, f"Repository directory does not exist: {repo_dir}", {} @@ -351,8 +357,9 @@ def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) - '--output-format', 'json', '--model', model_name, '--disallowed-tools', 'Bash(ps:*)', - '--json-schema', json.dumps(REVIEW_OUTPUT_SCHEMA), ] + if json_schema: + cmd.extend(['--json-schema', json.dumps(json_schema)]) NUM_RETRIES = 3 for attempt in range(NUM_RETRIES): @@ -407,7 +414,12 @@ def run_prompt(self, repo_dir: Path, prompt: str, model: Optional[str] = None) - def run_code_review(self, repo_dir: Path, prompt: str, model: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]: """Run code review prompt and normalize to findings payload.""" - success, error_msg, parsed = self.run_prompt(repo_dir, prompt, model=model or DEFAULT_CLAUDE_MODEL) + success, error_msg, parsed = self.run_prompt( + repo_dir, + prompt, + model=model or DEFAULT_CLAUDE_MODEL, + json_schema=REVIEW_OUTPUT_SCHEMA, + ) if not success: return False, error_msg, {} if isinstance(parsed, dict) and 'findings' in parsed: From e55a3e8fb5b3a235f4837546e6628a7596f87a89 Mon Sep 17 00:00:00 2001 From: Matej Bukovinski Date: Fri, 13 Feb 2026 14:11:23 +0100 Subject: [PATCH 12/12] Remove legacy single-pass compatibility and keep multi-agent paths only --- README.md | 2 - action.yml | 7 - claudecode/format_pr_comments.py | 314 --------- claudecode/github_action_audit.py | 151 +--- 
claudecode/prompts.py | 106 --- claudecode/review_orchestrator.py | 3 +- claudecode/test_claude_runner.py | 472 +++---------- claudecode/test_format_pr_comments.py | 372 ---------- claudecode/test_github_action_audit.py | 540 ++------------- claudecode/test_helper_functions.py | 30 - claudecode/test_main_function.py | 648 ++---------------- claudecode/test_prompts.py | 256 ++----- claudecode/test_workflow_integration.py | 4 +- scripts/determine-claudecode-enablement.sh | 8 +- .../test-determine-claudecode-enablement.sh | 24 +- 15 files changed, 265 insertions(+), 2672 deletions(-) delete mode 100644 claudecode/format_pr_comments.py delete mode 100644 claudecode/test_format_pr_comments.py diff --git a/README.md b/README.md index b55c320..59f8179 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,6 @@ This action is not hardened against prompt injection attacks and should only be | `model-security` | Model used for security phase. | `claude-opus-4-6` | No | | `model-validation` | Model used for finding validation phase. | `claude-sonnet-4-5` | No | | `claudecode-timeout` | Timeout for ClaudeCode analysis in minutes | `20` | No | -| `run-every-commit` | Run ClaudeCode on every commit (skips cache check). Warning: May increase false positives on PRs with many commits. **Deprecated**: Use `trigger-on-commit` instead. | `false` | No | | `trigger-on-open` | Run review when PR is first opened | `true` | No | | `trigger-on-commit` | Run review on every new commit | `false` | No | | `trigger-on-review-request` | Run review when someone requests a review from the bot | `true` | No | @@ -128,7 +127,6 @@ This action is not hardened against prompt injection attacks and should only be | `false-positive-filtering-instructions` | Path to custom false positive filtering instructions text file | None | No | | `custom-review-instructions` | Path to custom code review instructions text file to append to the audit prompt | None | No | | `custom-security-scan-instructions` | Path to custom security scan instructions text file to append to the security section | None | No | -| `dismiss-stale-reviews` | Dismiss previous bot reviews when posting a new review (useful for follow-up commits) | `true` | No | | `skip-draft-prs` | Skip code review on draft pull requests | `true` | No | | `app-slug` | GitHub App slug for bot mention detection. If using `actions/create-github-app-token@v1.9.0+`, pass `${{ steps.app-token.outputs.app-slug }}`. Otherwise defaults to `github-actions`. | `github-actions` | No | | `require-label` | Only run review if this label is present. Leave empty to review all PRs. Add `labeled` to your workflow `pull_request` types to trigger on label addition. | None | No | diff --git a/action.yml b/action.yml index a5708a5..241b8bb 100644 --- a/action.yml +++ b/action.yml @@ -58,12 +58,6 @@ inputs: required: false default: 'claude-sonnet-4-5' - run-every-commit: - description: 'DEPRECATED: Use trigger-on-commit instead. Run ClaudeCode on every commit (skips cache check). Warning: This may lead to more false positives on PRs with many commits as the AI analyzes the same code multiple times.' - required: false - default: 'false' - deprecationMessage: 'run-every-commit is deprecated. Use trigger-on-commit instead for more granular control over when reviews run.' 
- false-positive-filtering-instructions: description: 'Path to custom false positive filtering instructions text file' required: false @@ -274,7 +268,6 @@ runs: GITHUB_EVENT_NAME: ${{ github.event_name }} PR_NUMBER: ${{ github.event.pull_request.number || steps.pr-info.outputs.pr_number }} GITHUB_SHA: ${{ github.event.pull_request.head.sha || steps.pr-info.outputs.pr_sha || github.sha }} - RUN_EVERY_COMMIT: ${{ inputs.run-every-commit }} TRIGGER_ON_OPEN: ${{ inputs.trigger-on-open }} TRIGGER_ON_COMMIT: ${{ inputs.trigger-on-commit }} TRIGGER_ON_REVIEW_REQUEST: ${{ inputs.trigger-on-review-request }} diff --git a/claudecode/format_pr_comments.py b/claudecode/format_pr_comments.py deleted file mode 100644 index eb60fe3..0000000 --- a/claudecode/format_pr_comments.py +++ /dev/null @@ -1,314 +0,0 @@ -"""Format PR comments for review context in prompts.""" - -from typing import Dict, Any, List -from datetime import datetime - -from claudecode.logger import get_logger - -logger = get_logger(__name__) - -# Maximum characters for review context to avoid prompt bloat -MAX_CONTEXT_CHARS = 15000 -# Maximum replies to include per thread -MAX_REPLIES_PER_THREAD = 5 -# Bot comment marker pattern -BOT_COMMENT_MARKER = "🤖 **Code Review Finding:" - -# GitHub reaction content to emoji mapping -REACTION_EMOJI_MAP = { - '+1': '👍', - '-1': '👎', - 'laugh': '😄', - 'confused': '😕', - 'heart': '❤️', - 'hooray': '🎉', - 'rocket': '🚀', - 'eyes': '👀', -} - - -def is_bot_comment(comment: Dict[str, Any]) -> bool: - """Check if a comment was posted by this bot. - - Args: - comment: Comment dictionary from GitHub API. - - Returns: - True if this is a bot review comment. - """ - body = comment.get('body', '') - user = comment.get('user', {}) - - return BOT_COMMENT_MARKER in body and user.get('type') == 'Bot' - - -def format_pr_comments_for_prompt( - bot_comment_threads: List[Dict[str, Any]], -) -> str: - """Format bot comment threads as review context for the prompt. - - Takes pre-built thread structures and formats them for Claude to consider - during re-review. - - Args: - bot_comment_threads: List of thread dicts, each containing: - - 'bot_comment': The original bot finding comment - - 'replies': List of user reply comments - - 'reactions': Dict of reaction counts (e.g., {'+1': 2, '-1': 1}) - - Returns: - Formatted string with previous review threads, or empty string if none. - """ - if not bot_comment_threads: - logger.info("No bot comment threads to format") - return "" - - # Apply reply truncation - threads = _truncate_replies(bot_comment_threads) - - logger.info(f"Formatting {len(threads)} bot comment thread(s)") - return _format_threads_for_prompt(threads) - - -def _truncate_replies(threads: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Truncate long reply threads to keep prompt size manageable. - - Args: - threads: List of thread dictionaries. - - Returns: - Threads with replies truncated to MAX_REPLIES_PER_THREAD. - """ - result = [] - for thread in threads: - thread_copy = dict(thread) - replies = thread_copy.get('replies', []) - - if len(replies) > MAX_REPLIES_PER_THREAD: - truncated_count = len(replies) - MAX_REPLIES_PER_THREAD - thread_copy['replies'] = replies[-MAX_REPLIES_PER_THREAD:] - thread_copy['truncated_replies'] = truncated_count - - result.append(thread_copy) - - return result - - -def _parse_bot_comment(body: str) -> Dict[str, str]: - """Parse structured data from bot comment body. - - Args: - body: The comment body text. 
- - Returns: - Dictionary with extracted fields (title, severity, category, etc.) - """ - result = { - 'title': '', - 'severity': '', - 'category': '', - 'impact': '', - 'recommendation': '' - } - - # Extract title from "🤖 **Code Review Finding: {title}**" - if BOT_COMMENT_MARKER in body: - start = body.find(BOT_COMMENT_MARKER) + len(BOT_COMMENT_MARKER) - end = body.find('**', start) - if end > start: - result['title'] = body[start:end].strip() - - # Extract severity - if '**Severity:**' in body: - start = body.find('**Severity:**') + len('**Severity:**') - end = body.find('\n', start) - if end > start: - result['severity'] = body[start:end].strip() - - # Extract category - if '**Category:**' in body: - start = body.find('**Category:**') + len('**Category:**') - end = body.find('\n', start) - if end > start: - result['category'] = body[start:end].strip() - - # Extract impact - if '**Impact:**' in body: - start = body.find('**Impact:**') + len('**Impact:**') - end = body.find('\n\n', start) - if end == -1: - end = body.find('**Recommendation:**', start) - if end > start: - result['impact'] = body[start:end].strip() - - # Extract recommendation - if '**Recommendation:**' in body: - start = body.find('**Recommendation:**') + len('**Recommendation:**') - end = body.find('\n\n', start) - if end == -1: - end = body.find('```', start) # Code suggestion block - if end == -1: - end = len(body) - if end > start: - result['recommendation'] = body[start:end].strip() - - return result - - -def _format_timestamp(iso_timestamp: str) -> str: - """Format ISO timestamp to readable format. - - Args: - iso_timestamp: ISO 8601 timestamp string. - - Returns: - Human-readable timestamp. - """ - try: - dt = datetime.fromisoformat(iso_timestamp.replace('Z', '+00:00')) - return dt.strftime('%Y-%m-%d %H:%M UTC') - except (ValueError, AttributeError): - return iso_timestamp - - -def _format_threads_for_prompt(threads: List[Dict[str, Any]]) -> str: - """Format threads as readable text for Claude's prompt. - - Args: - threads: List of thread dictionaries. - - Returns: - Formatted string for inclusion in prompt. 
- """ - if not threads: - return "" - - lines = [ - "", - "═" * 65, - "PREVIOUS REVIEW CONTEXT", - "═" * 65, - "", - "The following findings were raised in previous reviews of this PR.", - "Review user responses to determine if issues should be re-raised.", - "" - ] - - for i, thread in enumerate(threads, 1): - bot_comment = thread['bot_comment'] - replies = thread['replies'] - reactions = thread.get('reactions', {}) - - # Get file and line info - file_path = bot_comment.get('path', 'unknown') - line = bot_comment.get('line') or bot_comment.get('original_line', '?') - - # Parse bot comment structure - parsed = _parse_bot_comment(bot_comment.get('body', '')) - - # Format thread header - lines.append(f"THREAD {i} - {file_path}:{line}") - lines.append("─" * 65) - - # Bot finding - timestamp = _format_timestamp(bot_comment.get('created_at', '')) - lines.append(f"Bot Finding ({timestamp})") - - if parsed['severity']: - lines.append(f" Severity: {parsed['severity']}") - if parsed['category']: - lines.append(f" Category: {parsed['category']}") - if parsed['title']: - lines.append(f" Title: {parsed['title']}") - if parsed['impact']: - lines.append(f" Impact: {parsed['impact'][:500]}...") # Truncate long impacts - if parsed['recommendation']: - lines.append(f" Recommendation: {parsed['recommendation'][:500]}...") - - # Add user reactions (excluding bot's own reactions) - if reactions: - reaction_parts = [] - for reaction, count in reactions.items(): - if count > 0: - emoji = REACTION_EMOJI_MAP.get(reaction, reaction) - reaction_parts.append(f"{emoji} {count}") - - if reaction_parts: - lines.append(f" User Reactions: {', '.join(reaction_parts)}") - - lines.append("") - - # Truncation notice - if thread.get('truncated_replies'): - lines.append(f" ({thread['truncated_replies']} earlier replies omitted)") - lines.append("") - - # User replies - for reply in replies: - user = reply.get('user', {}).get('login', 'unknown') - reply_timestamp = _format_timestamp(reply.get('created_at', '')) - reply_body = reply.get('body', '').strip() - - # Truncate very long replies - if len(reply_body) > 1000: - reply_body = reply_body[:1000] + "... (truncated)" - - lines.append(f"User Reply ({user}, {reply_timestamp}):") - # Indent reply text - for reply_line in reply_body.split('\n'): - lines.append(f" {reply_line}") - lines.append("") - - lines.append("─" * 65) - lines.append("") - - # Add instructions for re-review - lines.extend([ - "═" * 65, - "INSTRUCTIONS FOR RE-REVIEW", - "═" * 65, - "", - "When reviewing this PR with the above context:", - "", - "1. CHECK IF ISSUES WERE ADDRESSED: Compare previous findings against", - " the current diff. If code was changed to fix the issue, do NOT", - " re-raise it.", - "", - "2. EVALUATE USER RESPONSES: Read user replies carefully. Valid", - " dismissals include:", - " - Demonstrating the issue is a false positive with evidence", - " - Showing existing mitigations the bot missed", - " - Providing strong technical justification", - "", - " Invalid dismissals include:", - " - \"We'll fix this later\" (without code change)", - " - Misunderstanding the vulnerability/issue", - " - Ignoring the issue without explanation", - "", - "3. CONSIDER USER REACTIONS: Reactions provide additional signal:", - " - 👎 (thumbs down) suggests users found the finding unhelpful", - " - 👍 (thumbs up) suggests users found the finding valuable", - " - High 👎 count with no reply may indicate obvious false positive", - " - Use reactions as one input, but prioritize reply content", - "", - "4. 
RE-RAISE WHEN APPROPRIATE: If an issue was invalidly dismissed", - " or remains unaddressed, re-raise it with:", - " - Reference to the previous discussion", - " - Response to the user's dismissal reasoning", - " - Updated title: \"[Issue Title] (previously raised)\"", - "", - "5. DO NOT REPEAT RESOLVED ISSUES: If code was changed to address", - " a finding, do not mention it unless the fix is incomplete.", - "", - "═" * 65, - "" - ]) - - result = '\n'.join(lines) - - # Truncate if too long - if len(result) > MAX_CONTEXT_CHARS: - logger.warning(f"Review context truncated from {len(result)} to {MAX_CONTEXT_CHARS} chars") - result = result[:MAX_CONTEXT_CHARS] + "\n\n(Review context truncated due to length)\n" - - return result \ No newline at end of file diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index 2ba78d4..ca80af4 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -17,8 +17,6 @@ # Import existing components we can reuse from claudecode.findings_filter import FindingsFilter from claudecode.json_parser import parse_json_with_fallbacks -from claudecode.format_pr_comments import format_pr_comments_for_prompt, is_bot_comment -from claudecode.prompts import get_unified_review_prompt # Backward-compatible import for tests/extensions. from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator from claudecode.constants import ( EXIT_CONFIGURATION_ERROR, @@ -28,7 +26,6 @@ SUBPROCESS_TIMEOUT ) from claudecode.logger import get_logger -from claudecode.review_schema import REVIEW_OUTPUT_SCHEMA logger = get_logger(__name__) @@ -36,10 +33,6 @@ class ConfigurationError(ValueError): """Raised when configuration is invalid or missing.""" pass -class AuditError(ValueError): - """Raised when code review operations fail.""" - pass - class GitHubActionClient: """Simplified GitHub API client for GitHub Actions environment.""" @@ -204,56 +197,6 @@ def get_pr_diff(self, repo_name: str, pr_number: int) -> str: return self._filter_generated_files(response.text) - def get_pr_comments(self, repo_name: str, pr_number: int) -> List[Dict[str, Any]]: - """Get all review comments for a PR with pagination.""" - all_comments = [] - page = 1 - per_page = 100 - - while True: - url = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}/comments" - params = {'per_page': per_page, 'page': page} - - try: - response = requests.get(url, headers=self.headers, params=params) - response.raise_for_status() - comments = response.json() - - if not comments: - break - - all_comments.extend(comments) - if len(comments) < per_page: - break - page += 1 - except requests.RequestException as e: - logger.warning(f"Failed to fetch comments page {page}: {e}") - break - - return all_comments - - def get_comment_reactions(self, repo_name: str, comment_id: int) -> Dict[str, int]: - """Get reactions for a specific comment, excluding bot reactions.""" - url = f"https://api.github.com/repos/{repo_name}/pulls/comments/{comment_id}/reactions" - - try: - response = requests.get(url, headers=self.headers) - response.raise_for_status() - reactions = response.json() - - counts: Dict[str, int] = {} - for reaction in reactions: - user = reaction.get('user', {}) - if user.get('type') == 'Bot': - continue - content = reaction.get('content', '') - if content: - counts[content] = counts.get(content, 0) + 1 - return counts - except requests.RequestException as e: - logger.debug(f"Failed to fetch reactions for comment {comment_id}: {e}") - return {} - def 
_is_excluded(self, filepath: str) -> bool: """Check if a file should be excluded based on directory or file patterns.""" import fnmatch @@ -412,46 +355,6 @@ def run_prompt( except Exception as e: return False, f"Claude Code execution error: {str(e)}", {} - def run_code_review(self, repo_dir: Path, prompt: str, model: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]: - """Run code review prompt and normalize to findings payload.""" - success, error_msg, parsed = self.run_prompt( - repo_dir, - prompt, - model=model or DEFAULT_CLAUDE_MODEL, - json_schema=REVIEW_OUTPUT_SCHEMA, - ) - if not success: - return False, error_msg, {} - if isinstance(parsed, dict) and 'findings' in parsed: - return True, "", parsed - return True, "", self._extract_review_findings(parsed) - - def _extract_review_findings(self, claude_output: Any) -> Dict[str, Any]: - """Extract review findings from Claude's JSON response.""" - if isinstance(claude_output, dict): - if 'result' in claude_output: - result_text = claude_output['result'] - if isinstance(result_text, str): - # Try to extract JSON from the result text - success, result_json = parse_json_with_fallbacks(result_text, "Claude result text") - if success and result_json and 'findings' in result_json: - if 'pr_summary' not in result_json: - result_json['pr_summary'] = {} - return result_json - - # Return empty structure if no findings found - return { - 'findings': [], - 'pr_summary': {}, - 'analysis_summary': { - 'files_reviewed': 0, - 'high_severity': 0, - 'medium_severity': 0, - 'low_severity': 0, - 'review_completed': False, - } - } - def validate_claude_available(self) -> Tuple[bool, str]: """Validate that Claude Code is available.""" try: @@ -585,30 +488,6 @@ def get_max_diff_lines() -> int: -def run_code_review(claude_runner: SimpleClaudeRunner, prompt: str) -> Dict[str, Any]: - """Run the code review with Claude Code. - - Args: - claude_runner: Claude runner instance - prompt: The review prompt - - Returns: - Review results dictionary - - Raises: - AuditError: If the review fails - """ - # Get repo directory from environment or use current directory - repo_path = os.environ.get('REPO_PATH') - repo_dir = Path(repo_path) if repo_path else Path.cwd() - success, error_msg, results = claude_runner.run_code_review(repo_dir, prompt) - - if not success: - raise AuditError(f'Code review failed: {error_msg}') - - return results - - def apply_findings_filter(findings_filter, original_findings: List[Dict[str, Any]], pr_context: Dict[str, Any], github_client: GitHubActionClient) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], Dict[str, Any]]: """Apply findings filter to reduce false positives. @@ -670,7 +549,7 @@ def _is_finding_in_excluded_directory(finding: Dict[str, Any], github_client: Gi if not file_path: return False - return github_client._is_excluded(file_path) + return github_client.is_excluded(file_path) def main(): @@ -744,34 +623,6 @@ def main(): print(json.dumps({'error': f'Failed to fetch PR data: {str(e)}'})) sys.exit(EXIT_GENERAL_ERROR) - # Backward-compatible context collection for review thread history. 
- review_context = None - try: - pr_comments = github_client.get_pr_comments(repo_name, pr_number) - if pr_comments: - bot_comment_threads = [] - for comment in pr_comments: - if is_bot_comment(comment): - reactions = github_client.get_comment_reactions(repo_name, comment['id']) - replies = [ - c for c in pr_comments - if c.get('in_reply_to_id') == comment['id'] - ] - replies.sort(key=lambda c: c.get('created_at', '')) - bot_comment_threads.append({ - 'bot_comment': comment, - 'replies': replies, - 'reactions': reactions, - }) - bot_comment_threads.sort(key=lambda t: t['bot_comment'].get('created_at', '')) - if bot_comment_threads: - review_context = format_pr_comments_for_prompt(bot_comment_threads) - if review_context: - logger.info(f"Fetched previous review context ({len(review_context)} chars)") - except Exception as e: - logger.warning(f"Failed to fetch review context (continuing without it): {e}") - review_context = None - max_diff_lines = get_max_diff_lines() diff_line_count = len(pr_diff.splitlines()) if max_diff_lines and diff_line_count > max_diff_lines: diff --git a/claudecode/prompts.py b/claudecode/prompts.py index 9ae3f2e..7684710 100644 --- a/claudecode/prompts.py +++ b/claudecode/prompts.py @@ -40,32 +40,6 @@ def _build_hybrid_diff_section(pr_diff: str, max_lines: int) -> str: "Use this as a starting point only. You MUST validate findings with repository tool reads.\n" ) -def _build_diff_section(pr_diff: Optional[str], include_diff: bool) -> str: - """Build unified-prompt diff section for backward compatibility.""" - if pr_diff and include_diff: - return f""" - -PR DIFF CONTENT: -``` -{pr_diff} -``` - -Review the complete diff above. This contains all code changes in the PR. -""" - - return """ - -IMPORTANT - FILE READING INSTRUCTIONS: -You have access to the repository files. For each file listed above, use the Read tool to examine the changes. -Focus on the files that are most likely to contain issues based on the PR context. - -To review effectively: -1. Read each modified file to understand the current code -2. Look at surrounding code context when needed to understand the changes -3. Check related files if you need to understand dependencies or usage patterns -""" - - def _base_context_block(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: """Shared context block used across prompts.""" files_changed = _format_files_changed(pr_data) @@ -299,83 +273,3 @@ def build_validation_prompt( }} """ - -def get_unified_review_prompt( - pr_data: Dict[str, Any], - pr_diff: Optional[str] = None, - include_diff: bool = True, - custom_review_instructions: Optional[str] = None, - custom_security_instructions: Optional[str] = None, - review_context: Optional[str] = None, -) -> str: - """Backward-compatible unified prompt used by tests and direct calls.""" - files_changed = _format_files_changed(pr_data) - diff_section = _build_diff_section(pr_diff, include_diff) - custom_review_section = f"\n{custom_review_instructions}\n" if custom_review_instructions else "" - custom_security_section = f"\n{custom_security_instructions}\n" if custom_security_instructions else "" - - pr_description = (pr_data.get('body', '') or '').strip() - pr_description_section = "" - if pr_description: - if len(pr_description) > 2000: - pr_description = pr_description[:2000] + "... 
(truncated)" - pr_description_section = f"\nPR Description:\n{pr_description}\n" - - review_context_section = review_context or "" - - return f""" -You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data.get('number', 'unknown')}: "{pr_data.get('title', 'unknown')}" - -CONTEXT: -- Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} -- Author: {pr_data.get('user', 'unknown')} -- Files changed: {pr_data.get('changed_files', 0)} -- Lines added: {pr_data.get('additions', 0)} -- Lines deleted: {pr_data.get('deletions', 0)} -{pr_description_section} -Files modified: -{files_changed}{diff_section}{review_context_section} - -OBJECTIVE: -Perform a focused, high-signal code review to identify HIGH-CONFIDENCE issues introduced by this PR. - -CODE QUALITY CATEGORIES: -- correctness, reliability, performance, maintainability, testing -{custom_review_section} -SECURITY CATEGORIES: -- input validation, authn/authz, crypto/secrets, code execution, data exposure -{custom_security_section} - -REQUIRED OUTPUT FORMAT: - -{{ - "pr_summary": {{ - "overview": "2-4 sentence summary of what this PR changes and why it matters", - "file_changes": [ - {{ - "label": "src/auth.py", - "files": ["src/auth.py"], - "changes": "Brief description of changes (~10 words)" - }} - ] - }}, - "findings": [ - {{ - "file": "path/to/file.py", - "line": 42, - "severity": "HIGH|MEDIUM|LOW", - "category": "correctness|reliability|performance|maintainability|testing|security", - "title": "Short summary of the issue", - "description": "What is wrong and where it happens", - "impact": "Concrete impact or failure mode", - "recommendation": "Actionable fix or mitigation", - "confidence": 0.95 - }} - ] -}} - -PR SUMMARY GUIDELINES: -- overview: 2-4 sentences describing WHAT changed and WHY (purpose/goal) -- file_changes: One entry per file or group of related files -- changes: Brief description (~10 words), focus on purpose not implementation -""" diff --git a/claudecode/review_orchestrator.py b/claudecode/review_orchestrator.py index 8f70bac..1a8ca55 100644 --- a/claudecode/review_orchestrator.py +++ b/claudecode/review_orchestrator.py @@ -87,8 +87,7 @@ def _is_excluded(self, filepath: str) -> bool: checker = getattr(self.github_client, "is_excluded", None) if callable(checker): return bool(checker(filepath)) - # Fallback for current client/tests that still expose private method. - return bool(self.github_client._is_excluded(filepath)) + raise AttributeError("github_client must implement is_excluded(filepath)") def _collect_phase_findings(self, phase_result: Dict[str, Any], source_agent: str) -> List[Dict[str, Any]]: findings: List[Dict[str, Any]] = [] diff --git a/claudecode/test_claude_runner.py b/claudecode/test_claude_runner.py index 296fb78..403894b 100644 --- a/claudecode/test_claude_runner.py +++ b/claudecode/test_claude_runner.py @@ -1,438 +1,132 @@ #!/usr/bin/env python3 -""" -Unit tests for SimpleClaudeRunner. 
-""" +"""Unit tests for SimpleClaudeRunner.""" import json import os import subprocess -from unittest.mock import Mock, patch from pathlib import Path +from unittest.mock import Mock, patch -from claudecode.github_action_audit import SimpleClaudeRunner from claudecode.constants import DEFAULT_CLAUDE_MODEL +from claudecode.github_action_audit import SimpleClaudeRunner class TestSimpleClaudeRunner: - """Test SimpleClaudeRunner functionality.""" - def test_init(self): - """Test runner initialization.""" runner = SimpleClaudeRunner(timeout_minutes=30) assert runner.timeout_seconds == 1800 - - runner2 = SimpleClaudeRunner() # Default - assert runner2.timeout_seconds == 1200 # 20 minutes default - - @patch('subprocess.run') + + runner2 = SimpleClaudeRunner() + assert runner2.timeout_seconds == 1200 + + @patch("subprocess.run") def test_validate_claude_available_success(self, mock_run): - """Test successful Claude validation.""" - mock_run.return_value = Mock( - returncode=0, - stdout='claude version 1.0.0', - stderr='' - ) - - with patch.dict(os.environ, {'ANTHROPIC_API_KEY': 'test-key'}): + mock_run.return_value = Mock(returncode=0, stdout="claude version 1.0.0", stderr="") + + with patch.dict(os.environ, {"ANTHROPIC_API_KEY": "test-key"}): runner = SimpleClaudeRunner() success, error = runner.validate_claude_available() - + assert success is True - assert error == '' - mock_run.assert_called_once_with( - ['claude', '--version'], - capture_output=True, - text=True, - timeout=10 - ) - - @patch('subprocess.run') + assert error == "" + + @patch("subprocess.run") def test_validate_claude_available_no_api_key(self, mock_run): - """Test Claude validation without API key.""" - mock_run.return_value = Mock( - returncode=0, - stdout='claude version 1.0.0', - stderr='' - ) - - # Remove API key + mock_run.return_value = Mock(returncode=0, stdout="claude version 1.0.0", stderr="") + env = os.environ.copy() - env.pop('ANTHROPIC_API_KEY', None) - + env.pop("ANTHROPIC_API_KEY", None) with patch.dict(os.environ, env, clear=True): runner = SimpleClaudeRunner() success, error = runner.validate_claude_available() - - assert success is False - assert 'ANTHROPIC_API_KEY environment variable is not set' in error - - @patch('subprocess.run') - def test_validate_claude_available_not_installed(self, mock_run): - """Test Claude validation when not installed.""" - mock_run.side_effect = FileNotFoundError() - - runner = SimpleClaudeRunner() - success, error = runner.validate_claude_available() - - assert success is False - assert 'Claude Code is not installed or not in PATH' in error - - @patch('subprocess.run') - def test_validate_claude_available_error(self, mock_run): - """Test Claude validation with error.""" - mock_run.return_value = Mock( - returncode=1, - stdout='', - stderr='Error: Authentication failed' - ) - - runner = SimpleClaudeRunner() - success, error = runner.validate_claude_available() - - assert success is False - assert 'exit code 1' in error - assert 'Authentication failed' in error - - @patch('subprocess.run') - def test_validate_claude_available_timeout(self, mock_run): - """Test Claude validation timeout.""" - mock_run.side_effect = subprocess.TimeoutExpired(['claude'], 10) - - runner = SimpleClaudeRunner() - success, error = runner.validate_claude_available() - + assert success is False - assert 'timed out' in error - - def test_run_code_review_missing_directory(self): - """Test audit with missing directory.""" + assert "ANTHROPIC_API_KEY" in error + + def test_run_prompt_missing_directory(self): 
runner = SimpleClaudeRunner() - success, error, results = runner.run_code_review( - Path('/non/existent/path'), - "test prompt" - ) - + success, error, results = runner.run_prompt(Path("/non/existent/path"), "test prompt") + assert success is False - assert 'Repository directory does not exist' in error + assert "Repository directory does not exist" in error assert results == {} - - @patch('subprocess.run') - def test_run_code_review_success(self, mock_run): - """Test successful code review.""" - # Claude Code returns wrapped format with 'result' field - findings_data = { - "pr_summary": { - "overview": "Test PR summary", - "file_changes": [ - {"label": "test.py", "files": ["test.py"], "changes": "Test changes"} - ] - }, - "findings": [ - { - "file": "test.py", - "line": 10, - "severity": "HIGH", - "description": "SQL injection vulnerability" - } - ] - } - - audit_result = { - "result": json.dumps(findings_data) - } - - mock_run.return_value = Mock( - returncode=0, - stdout=json.dumps(audit_result), - stderr='' - ) - + + @patch("subprocess.run") + def test_run_prompt_success_with_nested_json(self, mock_run): + wrapped = {"result": json.dumps({"findings": [{"file": "test.py", "line": 1}]})} + mock_run.return_value = Mock(returncode=0, stdout=json.dumps(wrapped), stderr="") + runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - + with patch("pathlib.Path.exists", return_value=True): + success, error, results = runner.run_prompt(Path("/tmp/test"), "test prompt") + assert success is True - assert error == '' - assert len(results['findings']) == 1 - assert results['findings'][0]['severity'] == 'HIGH' - - # Verify subprocess call - mock_run.assert_called_once() - call_args = mock_run.call_args - cmd = call_args[0][0] - assert cmd[0] == 'claude' - assert '--output-format' in cmd - assert 'json' in cmd - assert '--model' in cmd + assert error == "" + assert results["findings"][0]["file"] == "test.py" + + cmd = mock_run.call_args[0][0] + assert cmd[0] == "claude" + assert "--output-format" in cmd + assert "json" in cmd + assert "--model" in cmd assert DEFAULT_CLAUDE_MODEL in cmd - assert '--disallowed-tools' in cmd - assert 'Bash(ps:*)' in cmd - assert '--json-schema' in cmd - assert call_args[1]['input'] == 'test prompt' - assert call_args[1]['cwd'] == Path('/tmp/test') - - @patch('subprocess.run') - def test_run_code_review_large_prompt_warning(self, mock_run, capsys): - """Test warning for large prompts.""" - mock_run.return_value = Mock( - returncode=0, - stdout='{"findings": []}', - stderr='' - ) - - # Create a prompt larger than 1MB - large_prompt = 'x' * (1024 * 1024 + 1000) - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - large_prompt - ) - - captured = capsys.readouterr() - assert '[Warning] Large prompt size' in captured.err - assert success is True - - @patch('subprocess.run') - def test_run_code_review_retry_on_failure(self, mock_run): - """Test retry logic on failure.""" - # First call fails, second succeeds - mock_run.side_effect = [ - Mock(returncode=1, stdout='', stderr='Temporary error'), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] - + assert "--disallowed-tools" in cmd + assert "Bash(ps:*)" in cmd + + @patch("subprocess.run") + def test_run_prompt_adds_json_schema_when_provided(self, mock_run): + mock_run.return_value = 
Mock(returncode=0, stdout='{"ok": true}', stderr="") + runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" + with patch("pathlib.Path.exists", return_value=True): + success, _, _ = runner.run_prompt( + Path("/tmp/test"), + "test prompt", + json_schema={"type": "object", "properties": {"ok": {"type": "boolean"}}}, ) - + assert success is True - assert error == '' - assert mock_run.call_count == 2 # Retried once - - @patch('subprocess.run') - def test_run_code_review_retry_on_error_during_execution(self, mock_run): - """Test retry on error_during_execution result.""" - error_result = { - "type": "result", - "subtype": "error_during_execution", - "error": "Temporary execution error" - } - - success_result = { - "result": json.dumps({ - "pr_summary": { - "overview": "Test", - "file_changes": [{"label": "test.py", "files": ["test.py"], "changes": "Test"}] - }, - "findings": [{"file": "test.py", "line": 1, "severity": "LOW", "description": "Issue"}] - }) - } - + cmd = mock_run.call_args[0][0] + assert "--json-schema" in cmd + + @patch("subprocess.run") + def test_run_prompt_retry_on_failure(self, mock_run): mock_run.side_effect = [ - Mock(returncode=0, stdout=json.dumps(error_result), stderr=''), - Mock(returncode=0, stdout=json.dumps(success_result), stderr='') + Mock(returncode=1, stdout="", stderr="Temporary error"), + Mock(returncode=0, stdout='{"findings": []}', stderr=""), ] - + runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - + with patch("pathlib.Path.exists", return_value=True): + success, error, _ = runner.run_prompt(Path("/tmp/test"), "test prompt") + assert success is True - assert len(results['findings']) == 1 + assert error == "" assert mock_run.call_count == 2 - - @patch('subprocess.run') - def test_run_code_review_timeout(self, mock_run): - """Test timeout handling.""" - mock_run.side_effect = subprocess.TimeoutExpired(['claude'], 1200) - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - - assert success is False - assert 'timed out after 20 minutes' in error - assert results == {} - - @patch('subprocess.run') - def test_run_code_review_json_parse_failure_with_retry(self, mock_run): - """Test JSON parse failure with retry.""" + + @patch("subprocess.run") + def test_run_prompt_json_parse_failure(self, mock_run): mock_run.side_effect = [ - Mock(returncode=0, stdout='Invalid JSON', stderr=''), - Mock(returncode=0, stdout='Still invalid', stderr='') + Mock(returncode=0, stdout="Invalid JSON", stderr=""), + Mock(returncode=0, stdout="Still invalid", stderr=""), ] - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test prompt" - ) - - assert success is False - assert 'Failed to parse Claude output' in error - assert mock_run.call_count == 2 - - def test_extract_review_findings_claude_wrapper(self): - """Test extraction from Claude Code wrapper format.""" - runner = SimpleClaudeRunner() - - # Test with result field containing JSON string - claude_output = { - "result": json.dumps({ - "pr_summary": {"overview": "Test", "file_changes": []}, - "findings": [ - {"file": "test.py", "line": 10, 
"severity": "HIGH"} - ] - }) - } - result = runner._extract_review_findings(claude_output) - assert len(result['findings']) == 1 - assert result['findings'][0]['file'] == 'test.py' - - def test_extract_review_findings_direct_format(self): - """Test that direct findings format was removed - only wrapped format is supported.""" runner = SimpleClaudeRunner() + with patch("pathlib.Path.exists", return_value=True): + success, error, _ = runner.run_prompt(Path("/tmp/test"), "test prompt") - # Direct format (without 'result' wrapper) should return empty - claude_output = { - "findings": [ - {"file": "main.py", "line": 20, "severity": "MEDIUM"} - ], - "pr_summary": { - "overview": "Test", - "file_changes": [] - } - } - - result = runner._extract_review_findings(claude_output) - # Should return empty structure since direct format is not supported - assert len(result['findings']) == 0 - assert result['pr_summary'] == {} - - def test_extract_review_findings_text_fallback(self): - """Test that text fallback was removed - only JSON is supported.""" - runner = SimpleClaudeRunner() - - # Test with result containing text (not JSON) - claude_output = { - "result": "Found SQL injection vulnerability in database.py line 45" - } + assert success is False + assert "Failed to parse Claude output" in error + assert mock_run.call_count == 2 - # Should return empty findings since we don't parse text anymore - result = runner._extract_review_findings(claude_output) - assert len(result['findings']) == 0 - assert result['pr_summary'] == {} - - def test_extract_review_findings_empty(self): - """Test extraction with no findings.""" - runner = SimpleClaudeRunner() + @patch("subprocess.run") + def test_run_prompt_timeout(self, mock_run): + mock_run.side_effect = subprocess.TimeoutExpired(["claude"], 1200) - # Various empty formats - for output in [None, {}, {"result": ""}, {"other": "data"}]: - result = runner._extract_review_findings(output) - assert result['findings'] == [] - assert result['pr_summary'] == {} - - def test_create_findings_from_text(self): - """Test that _create_findings_from_text was removed.""" runner = SimpleClaudeRunner() - - # Method should not exist - assert not hasattr(runner, '_create_findings_from_text') - - def test_create_findings_from_text_no_issues(self): - """Test that _create_findings_from_text was removed.""" - runner = SimpleClaudeRunner() - - # Method should not exist - assert not hasattr(runner, '_create_findings_from_text') - + with patch("pathlib.Path.exists", return_value=True): + success, error, results = runner.run_prompt(Path("/tmp/test"), "test prompt") -class TestClaudeRunnerEdgeCases: - """Test edge cases and error scenarios.""" - - @patch('subprocess.run') - def test_claude_output_formats(self, mock_run): - """Test various Claude output formats.""" - runner = SimpleClaudeRunner() - - # Test nested JSON in result - result field should be string - nested_output = { - "type": "result", - "result": json.dumps({ - "pr_summary": {"overview": "Test", "file_changes": []}, - "findings": [ - {"file": "test.py", "line": 1, "severity": "HIGH", "description": "Issue"} - ] - }) - } - - with patch('pathlib.Path.exists', return_value=True): - mock_run.return_value = Mock( - returncode=0, - stdout=json.dumps(nested_output), - stderr='' - ) - - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test" - ) - - # Should extract findings from nested structure - assert success is True - assert len(results['findings']) == 1 - - @patch('subprocess.run') - def 
test_partial_json_recovery(self, mock_run): - """Test recovery from partial JSON output.""" - # Simulate truncated JSON - partial_json = '{"findings": [{"file": "test.py", "line": 10, "sev' - - mock_run.return_value = Mock( - returncode=0, - stdout=partial_json, - stderr='' - ) - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test" - ) - - # Should fail to parse and retry - assert mock_run.call_count == 2 - - @patch('subprocess.run') - def test_exception_handling(self, mock_run): - """Test general exception handling.""" - mock_run.side_effect = Exception("Unexpected error") - - runner = SimpleClaudeRunner() - with patch('pathlib.Path.exists', return_value=True): - success, error, results = runner.run_code_review( - Path('/tmp/test'), - "test" - ) - assert success is False - assert 'Unexpected error' in error + assert "timed out" in error assert results == {} diff --git a/claudecode/test_format_pr_comments.py b/claudecode/test_format_pr_comments.py deleted file mode 100644 index b96fbc3..0000000 --- a/claudecode/test_format_pr_comments.py +++ /dev/null @@ -1,372 +0,0 @@ -"""Unit tests for the format_pr_comments module.""" - -import pytest -from claudecode.format_pr_comments import ( - format_pr_comments_for_prompt, - is_bot_comment, - _truncate_replies, - _parse_bot_comment, - _format_timestamp, - BOT_COMMENT_MARKER, -) - - -def _make_bot_comment( - comment_id: int, - file_path: str = "app.py", - line: int = 42, - title: str = "SQL Injection", - severity: str = "HIGH", - category: str = "security", - impact: str = "Attacker can access database", - recommendation: str = "Use parameterized queries", - created_at: str = "2024-01-15T10:00:00Z", -): - """Create a bot comment for testing.""" - body = f"""{BOT_COMMENT_MARKER} {title}** - -**Severity:** {severity} -**Category:** {category} - -**Impact:** {impact} - -**Recommendation:** {recommendation} -""" - return { - 'id': comment_id, - 'body': body, - 'path': file_path, - 'line': line, - 'created_at': created_at, - 'user': {'login': 'github-actions[bot]', 'type': 'Bot'}, - 'in_reply_to_id': None, - } - - -def _make_user_reply( - comment_id: int, - in_reply_to_id: int, - body: str, - username: str = "testuser", - created_at: str = "2024-01-15T11:00:00Z", -): - """Create a user reply comment for testing.""" - return { - 'id': comment_id, - 'body': body, - 'path': 'app.py', - 'line': 42, - 'created_at': created_at, - 'user': {'login': username, 'type': 'User'}, - 'in_reply_to_id': in_reply_to_id, - } - - -def _make_thread(bot_comment, replies=None, reactions=None): - """Create a thread structure for testing.""" - return { - 'bot_comment': bot_comment, - 'replies': replies or [], - 'reactions': reactions or {}, - } - - -class TestFormatPrCommentsForPrompt: - """Tests for format_pr_comments_for_prompt function.""" - - def test_no_threads(self): - """Test with no threads.""" - result = format_pr_comments_for_prompt([]) - assert result == "" - - def test_single_thread_no_replies(self): - """Test with single bot comment, no replies.""" - bot_comment = _make_bot_comment(1) - threads = [_make_thread(bot_comment)] - - result = format_pr_comments_for_prompt(threads) - - assert "PREVIOUS REVIEW CONTEXT" in result - assert "SQL Injection" in result - assert "HIGH" in result - assert "security" in result - assert "app.py:42" in result - - def test_thread_with_user_replies(self): - """Test with bot comment and user replies.""" - bot_comment = 
_make_bot_comment(1) - user_reply = _make_user_reply( - 2, 1, - "This is a false positive - we sanitize input at the controller level.", - username="alice" - ) - threads = [_make_thread(bot_comment, replies=[user_reply])] - - result = format_pr_comments_for_prompt(threads) - - assert "PREVIOUS REVIEW CONTEXT" in result - assert "SQL Injection" in result - assert "alice" in result - assert "false positive" in result - assert "sanitize input" in result - - def test_multiple_threads(self): - """Test with multiple bot findings and replies.""" - bot_comment1 = _make_bot_comment( - 1, "auth.py", 10, "Hardcoded Password", "HIGH", "security" - ) - bot_comment2 = _make_bot_comment( - 2, "api.py", 50, "N+1 Query", "MEDIUM", "performance", - created_at="2024-01-15T10:30:00Z" - ) - reply1 = _make_user_reply(3, 1, "Fixed in commit abc123") - reply2 = _make_user_reply(4, 2, "This is by design for now") - - threads = [ - _make_thread(bot_comment1, replies=[reply1]), - _make_thread(bot_comment2, replies=[reply2]), - ] - - result = format_pr_comments_for_prompt(threads) - - assert "THREAD 1" in result - assert "THREAD 2" in result - assert "Hardcoded Password" in result - assert "N+1 Query" in result - assert "Fixed in commit" in result - assert "by design" in result - - def test_reactions_included(self): - """Test that reactions are included when provided.""" - bot_comment = _make_bot_comment(1) - threads = [_make_thread(bot_comment, reactions={'-1': 2})] - - result = format_pr_comments_for_prompt(threads) - - # Should show 2 thumbs down - assert "👎 2" in result - - def test_all_reaction_types(self): - """Test that all GitHub reaction types are converted to emojis.""" - bot_comment = _make_bot_comment(1) - reactions = { - '+1': 3, - '-1': 1, - 'laugh': 2, - 'confused': 1, - 'heart': 4, - 'hooray': 2, - 'rocket': 1, - 'eyes': 3, - } - threads = [_make_thread(bot_comment, reactions=reactions)] - - result = format_pr_comments_for_prompt(threads) - - # All reactions should be converted to emojis - assert "👍 3" in result - assert "👎 1" in result - assert "😄 2" in result - assert "😕 1" in result - assert "❤️ 4" in result - assert "🎉 2" in result - assert "🚀 1" in result - assert "👀 3" in result - - def test_truncate_long_replies(self): - """Test that very long reply threads are truncated.""" - bot_comment = _make_bot_comment(1) - - # Create 10 replies (exceeds MAX_REPLIES_PER_THREAD of 5) - replies = [ - _make_user_reply( - i + 10, 1, f"Reply {i}", - created_at=f"2024-01-15T{10+i:02d}:00:00Z" - ) - for i in range(10) - ] - - threads = [_make_thread(bot_comment, replies=replies)] - result = format_pr_comments_for_prompt(threads) - - # Should mention truncation - assert "earlier replies omitted" in result - # Should have the later replies (5-9) - assert "Reply 9" in result - assert "Reply 8" in result - - -class TestIsBotComment: - """Tests for is_bot_comment function.""" - - def test_with_marker_and_bot(self): - """Test bot comment true case.""" - bot_comment = {'body': f'{BOT_COMMENT_MARKER} Test**', 'user': {'type': 'Bot'}} - assert is_bot_comment(bot_comment) is True - - def test_without_marker_user_type(self): - """Test non-bot comment detection.""" - user_comment = {'body': 'Regular comment', 'user': {'type': 'User'}} - assert is_bot_comment(user_comment) is False - - def test_by_user_type(self): - """Test bot comment detection but not our bot.""" - bot_comment = {'body': 'Some automated message', 'user': {'type': 'Bot'}} - assert is_bot_comment(bot_comment) is False - - def test_with_marker(self): - 
"""Test comment with marker but not from a bot.""" - bot_comment = {'body': f'{BOT_COMMENT_MARKER} Test**', 'user': {'type': 'User'}} - assert is_bot_comment(bot_comment) is False - - -class TestTruncateReplies: - """Tests for _truncate_replies function.""" - - def test_no_truncation_needed(self): - """Test when replies are under the limit.""" - bot_comment = _make_bot_comment(1) - replies = [_make_user_reply(i, 1, f"Reply {i}") for i in range(3)] - threads = [_make_thread(bot_comment, replies=replies)] - - result = _truncate_replies(threads) - - assert len(result[0]['replies']) == 3 - assert 'truncated_replies' not in result[0] - - def test_truncation_applied(self): - """Test when replies exceed the limit.""" - bot_comment = _make_bot_comment(1) - replies = [ - _make_user_reply(i, 1, f"Reply {i}", created_at=f"2024-01-15T{10+i:02d}:00:00Z") - for i in range(10) - ] - threads = [_make_thread(bot_comment, replies=replies)] - - result = _truncate_replies(threads) - - assert len(result[0]['replies']) == 5 - assert result[0]['truncated_replies'] == 5 - - -class TestParseBotComment: - """Tests for _parse_bot_comment function.""" - - def test_parse_full_comment(self): - """Test parsing structured data from bot comment.""" - body = f"""{BOT_COMMENT_MARKER} SQL Injection Vulnerability** - -**Severity:** HIGH -**Category:** security - -**Impact:** Attacker can execute arbitrary SQL queries - -**Recommendation:** Use parameterized queries -""" - result = _parse_bot_comment(body) - - assert result['title'] == "SQL Injection Vulnerability" - assert result['severity'] == "HIGH" - assert result['category'] == "security" - assert "arbitrary SQL" in result['impact'] - assert "parameterized" in result['recommendation'] - - -class TestFormatTimestamp: - """Tests for _format_timestamp function.""" - - def test_valid_timestamp(self): - """Test formatting valid ISO timestamp.""" - result = _format_timestamp("2024-01-15T10:30:00Z") - assert "2024-01-15" in result - assert "10:30" in result - - def test_invalid_timestamp(self): - """Test handling invalid timestamp.""" - result = _format_timestamp("invalid") - assert result == "invalid" - - -class TestPromptsIntegration: - """Test prompts.py integration with review context.""" - - def test_unified_prompt_with_review_context(self): - """Test that review context is included in unified prompt.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - "number": 123, - "title": "Test PR", - "body": "Test description", - "user": "testuser", - "changed_files": 1, - "additions": 10, - "deletions": 5, - "head": {"repo": {"full_name": "owner/repo"}}, - "files": [{"filename": "app.py"}], - } - - review_context = """ -═══════════════════════════════════════════════════════════════ -PREVIOUS REVIEW CONTEXT -═══════════════════════════════════════════════════════════════ -THREAD 1 - app.py:42 -Bot Finding (2024-01-15): SQL Injection -User Reply: This is a false positive -""" - - prompt = get_unified_review_prompt( - pr_data, - pr_diff="diff --git a/app.py b/app.py", - review_context=review_context - ) - - assert "PREVIOUS REVIEW CONTEXT" in prompt - assert "SQL Injection" in prompt - assert "false positive" in prompt - - def test_unified_prompt_without_review_context(self): - """Test that prompt works without review context.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - "number": 123, - "title": "Test PR", - "body": "Test description", - "user": "testuser", - "changed_files": 1, - "additions": 10, - "deletions": 5, - 
"head": {"repo": {"full_name": "owner/repo"}}, - "files": [{"filename": "app.py"}], - } - - prompt = get_unified_review_prompt( - pr_data, - pr_diff="diff --git a/app.py b/app.py", - review_context=None - ) - - assert "PREVIOUS REVIEW CONTEXT" not in prompt - assert "CONTEXT:" in prompt - - def test_unified_prompt_includes_pr_description(self): - """Test that PR description is included in prompt.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - "number": 123, - "title": "Test PR", - "body": "This PR adds important security fixes for the auth module.", - "user": "testuser", - "changed_files": 1, - "additions": 10, - "deletions": 5, - "head": {"repo": {"full_name": "owner/repo"}}, - "files": [{"filename": "auth.py"}], - } - - prompt = get_unified_review_prompt(pr_data, pr_diff="diff") - - assert "PR Description:" in prompt - assert "security fixes for the auth module" in prompt \ No newline at end of file diff --git a/claudecode/test_github_action_audit.py b/claudecode/test_github_action_audit.py index dc4a7c7..c7c8c70 100644 --- a/claudecode/test_github_action_audit.py +++ b/claudecode/test_github_action_audit.py @@ -1,527 +1,95 @@ #!/usr/bin/env python3 -""" -Pytest tests for GitHub Action audit script components. -""" +"""Pytest tests for github_action_audit module components.""" class TestImports: - """Test that all required modules can be imported.""" - def test_main_module_import(self): - """Test that the main module can be imported.""" from claudecode import github_action_audit - assert hasattr(github_action_audit, 'GitHubActionClient') - assert hasattr(github_action_audit, 'SimpleClaudeRunner') - # SimpleFindingsFilter was removed - assert hasattr(github_action_audit, 'main') - + + assert hasattr(github_action_audit, "GitHubActionClient") + assert hasattr(github_action_audit, "SimpleClaudeRunner") + assert hasattr(github_action_audit, "main") + def test_component_imports(self): - """Test that all component modules can be imported.""" - from claudecode.prompts import get_unified_review_prompt - from claudecode.json_parser import parse_json_with_fallbacks, extract_json_from_text - - # Verify they're callable/usable - assert callable(get_unified_review_prompt) + from claudecode.json_parser import extract_json_from_text, parse_json_with_fallbacks + from claudecode.prompts import build_triage_prompt + assert callable(parse_json_with_fallbacks) assert callable(extract_json_from_text) + assert callable(build_triage_prompt) class TestHardExclusionRules: - """Test the HardExclusionRules patterns.""" - def test_dos_patterns(self): - """Test DOS pattern exclusions.""" - from claudecode.findings_filter import HardExclusionRules - - dos_findings = [ - {'description': 'Potential denial of service vulnerability', 'category': 'security'}, - {'description': 'DOS attack through resource exhaustion', 'category': 'security'}, - {'description': 'Infinite loop causing resource exhaustion', 'category': 'security'}, - ] - - for finding in dos_findings: - reason = HardExclusionRules.get_exclusion_reason(finding) - assert reason is not None - assert 'dos' in reason.lower() - - def test_rate_limiting_patterns(self): - """Test rate limiting pattern exclusions.""" from claudecode.findings_filter import HardExclusionRules - - rate_limit_findings = [ - {'description': 'Missing rate limiting on endpoint', 'category': 'security'}, - {'description': 'No rate limit implemented for API', 'category': 'security'}, - {'description': 'Implement rate limiting for this route', 'category': 
'security'}, - ] - - for finding in rate_limit_findings: - reason = HardExclusionRules.get_exclusion_reason(finding) - assert reason is not None - assert 'rate limit' in reason.lower() - - def test_open_redirect_patterns(self): - """Test open redirect pattern exclusions.""" - from claudecode.findings_filter import HardExclusionRules - - redirect_findings = [ - {'description': 'Open redirect vulnerability found', 'category': 'security'}, - {'description': 'Unvalidated redirect in URL parameter', 'category': 'security'}, - {'description': 'Redirect attack possible through user input', 'category': 'security'}, - ] - - for finding in redirect_findings: + + for finding in [ + {"description": "Potential denial of service vulnerability", "category": "security"}, + {"description": "DOS attack through resource exhaustion", "category": "security"}, + {"description": "Infinite loop causing resource exhaustion", "category": "security"}, + ]: reason = HardExclusionRules.get_exclusion_reason(finding) assert reason is not None - assert 'open redirect' in reason.lower() - + assert "dos" in reason.lower() + def test_markdown_file_exclusion(self): - """Test that findings in .md files are excluded.""" from claudecode.findings_filter import HardExclusionRules - - md_findings = [ - {'file': 'README.md', 'description': 'SQL injection vulnerability', 'category': 'security'}, - {'file': 'docs/security.md', 'description': 'Command injection found', 'category': 'security'}, - {'file': 'CHANGELOG.MD', 'description': 'XSS vulnerability', 'category': 'security'}, # Test case insensitive - {'file': 'path/to/file.Md', 'description': 'Path traversal', 'category': 'security'}, # Mixed case - ] - - for finding in md_findings: + + for finding in [ + {"file": "README.md", "description": "SQL injection vulnerability", "category": "security"}, + {"file": "docs/security.md", "description": "Command injection found", "category": "security"}, + ]: reason = HardExclusionRules.get_exclusion_reason(finding) assert reason is not None - assert 'markdown' in reason.lower() - - def test_non_markdown_files_not_excluded(self): - """Test that findings in non-.md files are not excluded due to file extension.""" - from claudecode.findings_filter import HardExclusionRules - - non_md_findings = [ - {'file': 'main.py', 'description': 'SQL injection vulnerability'}, - {'file': 'server.js', 'description': 'Command injection found'}, - {'file': 'index.html', 'description': 'XSS vulnerability'}, - {'file': 'config.yml', 'description': 'Hardcoded credentials'}, - {'file': 'README.txt', 'description': 'Path traversal'}, - {'file': 'file.mdx', 'description': 'Security issue'}, # Not .md - ] - - for finding in non_md_findings: - reason = HardExclusionRules.get_exclusion_reason(finding) - # Should not be excluded for being a markdown file - # (might be excluded for other reasons like DOS patterns) - if reason: - assert 'markdown' not in reason.lower() - - def test_keeps_real_vulnerabilities(self): - """Test that real vulnerabilities are not excluded.""" - from claudecode.findings_filter import HardExclusionRules - - real_vulns = [ - {'file': 'auth.py', 'description': 'SQL injection in user authentication', 'category': 'security'}, - {'file': 'exec.js', 'description': 'Command injection through user input', 'category': 'security'}, - {'file': 'comments.php', 'description': 'Cross-site scripting in comment field', 'category': 'security'}, - {'file': 'upload.go', 'description': 'Path traversal in file upload', 'category': 'security'}, - ] - - for finding 
in real_vulns: - reason = HardExclusionRules.get_exclusion_reason(finding) - assert reason is None + assert "markdown" in reason.lower() class TestJSONParser: - """Test JSON parsing utilities.""" - def test_parse_valid_json(self): - """Test parsing valid JSON.""" from claudecode.json_parser import parse_json_with_fallbacks - - valid_json = '{"test": "data", "number": 123}' - success, result = parse_json_with_fallbacks(valid_json, "test") - + + success, result = parse_json_with_fallbacks('{"test": "data", "number": 123}', "test") assert success is True assert result == {"test": "data", "number": 123} - - def test_parse_invalid_json(self): - """Test parsing invalid JSON.""" - from claudecode.json_parser import parse_json_with_fallbacks - - invalid_json = '{invalid json}' - success, result = parse_json_with_fallbacks(invalid_json, "test") - - assert success is False - assert 'error' in result - assert 'Invalid JSON response' in result['error'] - + def test_extract_json_from_text(self): - """Test extracting JSON from mixed text.""" - from claudecode.json_parser import extract_json_from_text - - mixed_text = 'Some text before {"key": "value"} some text after' - result = extract_json_from_text(mixed_text) - - assert result == {"key": "value"} - - def test_extract_json_from_text_no_json(self): - """Test extracting JSON when none exists.""" from claudecode.json_parser import extract_json_from_text - - plain_text = 'This is just plain text with no JSON' - result = extract_json_from_text(plain_text) - - assert result is None - - -class TestPromptsModule: - """Test the prompts module.""" - - def test_get_unified_review_prompt(self): - """Test unified review prompt generation.""" - from claudecode.prompts import get_unified_review_prompt - - pr_data = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Test description', - 'user': 'testuser', - 'changed_files': 1, - 'additions': 10, - 'deletions': 5, - 'head': { - 'repo': { - 'full_name': 'owner/repo' - } - }, - 'files': [ - { - 'filename': 'test.py', - 'status': 'modified', - 'additions': 10, - 'deletions': 5, - 'patch': '@@ -1,5 +1,10 @@\n+added line' - } - ] - } - - pr_diff = "diff --git a/test.py b/test.py\n+added line" - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert isinstance(prompt, str) - assert 'security' in prompt.lower() - assert 'PR #123' in prompt - assert 'test.py' in prompt - -class TestDeploymentPRDetection: - """Test deployment PR title pattern matching.""" - - def test_deployment_pr_patterns(self): - """Test that deployment PR titles are correctly identified.""" - import re - - deployment_pattern = r'^Deploy\s+[a-f0-9]{6,}\s+to\s+(production|staging|development|production-services)' - - # These should match - deployment_titles = [ - "Deploy 53f395b0 to production-services", - "Deploy af179b5b to production", - "Deploy 1a3cb909 to production", - "Deploy 49c09ea5 to production-services", - "Deploy 8e7acc60 to production", - "Deploy e0b1fe0b to production-services", - "Deploy c53e6010 to production", - "Deploy 42c4a061 to production", - "Deploy 9de55976 to production-services", - "deploy abcdef123456 to staging", # lowercase should work - "DEPLOY ABCDEF01 TO DEVELOPMENT", # uppercase should work - ] - - for title in deployment_titles: - assert re.match(deployment_pattern, title, re.IGNORECASE), f"Failed to match deployment PR: {title}" - - def test_non_deployment_pr_patterns(self): - """Test that non-deployment PR titles are not matched.""" - import re - - deployment_pattern = 
r'^Deploy\s+[a-f0-9]{6,}\s+to\s+(production|staging|development|production-services)' - - # These should NOT match - non_deployment_titles = [ - "Add new feature", - "Fix bug in deployment script", - "Update deployment documentation", - "Deploy new feature to production", # No commit hash - "Deploy abc to production", # Too short hash - "Deploy 12345g to production", # Non-hex character - "Preparing deploy af179b5b to production", # Doesn't start with Deploy - "Deploy af179b5b to testing", # Wrong environment - "Deploy af179b5b", # Missing environment - "af179b5b to production", # Missing Deploy prefix - ] - - for title in non_deployment_titles: - assert not re.match(deployment_pattern, title, re.IGNORECASE), f"Incorrectly matched non-deployment PR: {title}" + assert extract_json_from_text('Some text before {"key": "value"} some text after') == {"key": "value"} class TestBuiltinExclusions: - """Test built-in file and directory exclusions.""" - def test_builtin_excluded_directories(self): - """Test that built-in directories are in the exclusion list.""" - from claudecode.github_action_audit import GitHubActionClient - - expected_dirs = [ - 'node_modules', - 'vendor', - 'dist', - 'build', - '.next', - '__pycache__', - '.gradle', - 'Pods', - 'DerivedData', - ] - - for dir_name in expected_dirs: - assert dir_name in GitHubActionClient.BUILTIN_EXCLUDED_DIRS, f"Missing built-in excluded dir: {dir_name}" - - def test_builtin_excluded_patterns(self): - """Test that built-in file patterns are in the exclusion list.""" from claudecode.github_action_audit import GitHubActionClient - expected_patterns = [ - 'package-lock.json', - 'yarn.lock', - '*.min.js', - '*.min.css', - '*.pb.go', - '*.generated.*', - '*.png', - '*.jpg', - '*.woff2', - '*.pyc', - ] - - for pattern in expected_patterns: - assert pattern in GitHubActionClient.BUILTIN_EXCLUDED_PATTERNS, f"Missing built-in excluded pattern: {pattern}" - - def test_is_excluded_lock_files(self): - """Test that lock files are excluded.""" - from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - lock_files = [ - 'package-lock.json', - 'yarn.lock', - 'Gemfile.lock', - 'poetry.lock', - 'Cargo.lock', - 'go.sum', - 'nested/path/package-lock.json', - ] - - for filepath in lock_files: - assert client._is_excluded(filepath), f"Lock file should be excluded: {filepath}" + for dir_name in ["node_modules", "vendor", "dist", "build", ".next", "__pycache__", ".gradle", "Pods", "DerivedData"]: + assert dir_name in GitHubActionClient.BUILTIN_EXCLUDED_DIRS - def test_is_excluded_generated_files(self): - """Test that generated files are excluded.""" - from claudecode.github_action_audit import GitHubActionClient + def test_is_excluded_source_files(self): from unittest.mock import patch - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - generated_files = [ - 'app.min.js', - 'styles.min.css', - 'app.bundle.js', - 'main.chunk.js', - 'api.pb.go', - 'models.generated.ts', - 'user.g.dart', - ] - - for filepath in generated_files: - assert client._is_excluded(filepath), f"Generated file should be excluded: {filepath}" - - def test_is_excluded_binary_files(self): - """Test that binary files are excluded.""" from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - with patch.dict('os.environ', 
{'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): + with patch.dict("os.environ", {"GITHUB_TOKEN": "test-token", "EXCLUDE_DIRECTORIES": ""}): client = GitHubActionClient() - binary_files = [ - 'logo.png', - 'photo.jpg', - 'icon.ico', - 'font.woff2', - 'document.pdf', - 'archive.zip', - ] - - for filepath in binary_files: - assert client._is_excluded(filepath), f"Binary file should be excluded: {filepath}" + assert not client.is_excluded("src/main.py") + assert not client.is_excluded("tests/test_auth.py") - def test_is_excluded_vendor_directories(self): - """Test that vendor directories are excluded.""" - from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - vendor_paths = [ - 'node_modules/lodash/index.js', - 'vendor/github.com/pkg/errors/errors.go', - 'dist/bundle.js', - 'build/output.js', - '.next/cache/data.json', - '__pycache__/module.pyc', - 'Pods/AFNetworking/Source.m', - ] - - for filepath in vendor_paths: - assert client._is_excluded(filepath), f"Vendor path should be excluded: {filepath}" - - def test_is_not_excluded_source_files(self): - """Test that regular source files are NOT excluded.""" - from claudecode.github_action_audit import GitHubActionClient + def test_user_exclusions_combined_with_builtin(self): from unittest.mock import patch - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': ''}): - client = GitHubActionClient() - - source_files = [ - 'src/main.py', - 'lib/utils.js', - 'app/models/user.rb', - 'pkg/handler/api.go', - 'src/components/Button.tsx', - 'tests/test_auth.py', - ] - - for filepath in source_files: - assert not client._is_excluded(filepath), f"Source file should NOT be excluded: {filepath}" - - def test_user_exclusions_combined_with_builtin(self): - """Test that user exclusions are combined with built-in exclusions.""" from claudecode.github_action_audit import GitHubActionClient - from unittest.mock import patch - with patch.dict('os.environ', {'GITHUB_TOKEN': 'test-token', 'EXCLUDE_DIRECTORIES': 'custom_dir,my_vendor'}): + with patch.dict("os.environ", {"GITHUB_TOKEN": "test-token", "EXCLUDE_DIRECTORIES": "custom_dir,my_vendor"}): client = GitHubActionClient() - # Built-in should still work - assert client._is_excluded('node_modules/pkg/index.js') - assert client._is_excluded('vendor/lib/code.go') - - # User exclusions should also work - assert client._is_excluded('custom_dir/file.py') - assert client._is_excluded('my_vendor/lib.js') - - -class TestExtractReviewFindings: - """Test the _extract_review_findings method.""" - - def test_extract_findings_with_pr_summary(self): - """Test that pr_summary is extracted from Claude output.""" - import json - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - claude_output = { - 'result': json.dumps({ - 'findings': [ - {'file': 'test.py', 'line': 1, 'severity': 'HIGH', 'description': 'Test finding'} - ], - 'pr_summary': { - 'overview': 'This PR adds authentication middleware.', - 'file_changes': [ - {'label': 'src/auth.py', 'files': ['src/auth.py'], 'changes': 'Added JWT validation'} - ] - } - }) - } - - result = runner._extract_review_findings(claude_output) - - assert 'pr_summary' in result - assert result['pr_summary']['overview'] == 'This PR adds authentication middleware.' 
- assert len(result['pr_summary']['file_changes']) == 1 - assert result['pr_summary']['file_changes'][0]['files'] == ['src/auth.py'] - assert len(result['findings']) == 1 - - def test_extract_findings_without_pr_summary(self): - """Test that missing pr_summary defaults to empty dict.""" - import json - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - claude_output = { - 'result': json.dumps({ - 'findings': [] - }) - } - - result = runner._extract_review_findings(claude_output) - - assert 'pr_summary' in result - assert result['pr_summary'] == {} - - def test_extract_findings_empty_result(self): - """Test extraction with invalid/empty Claude output.""" - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - result = runner._extract_review_findings({}) - - assert 'pr_summary' in result - assert result['pr_summary'] == {} - assert result['findings'] == [] - - def test_extract_findings_with_grouped_files(self): - """Test that grouped file_changes are handled correctly.""" - import json - from claudecode.github_action_audit import SimpleClaudeRunner - - runner = SimpleClaudeRunner() - - claude_output = { - 'result': json.dumps({ - 'findings': [], - 'pr_summary': { - 'overview': 'Test updates.', - 'file_changes': [ - { - 'label': 'tests/test_*.py', - 'files': ['tests/test_auth.py', 'tests/test_login.py', 'tests/test_session.py'], - 'changes': 'Unit tests for auth module' - } - ] - } - }) - } - - result = runner._extract_review_findings(claude_output) - - assert 'pr_summary' in result - file_changes = result['pr_summary']['file_changes'] - assert len(file_changes) == 1 - assert file_changes[0]['label'] == 'tests/test_*.py' - assert len(file_changes[0]['files']) == 3 + assert client.is_excluded("node_modules/pkg/index.js") + assert client.is_excluded("custom_dir/file.py") class TestDiffSizeLimits: - """Test diff size limit functionality.""" - def test_diff_line_counting(self): - """Test that diff lines are counted correctly.""" sample_diff = """diff --git a/file.py b/file.py --- a/file.py +++ b/file.py @@ -534,34 +102,4 @@ def test_diff_line_counting(self): +replaced line 5 line 6""" - line_count = len(sample_diff.splitlines()) - assert line_count == 11 # Count the actual lines - - def test_max_diff_lines_env_parsing(self): - """Test that MAX_DIFF_LINES environment variable is parsed correctly.""" - import os - from unittest.mock import patch - - # Test default value when env var is not set - with patch.dict('os.environ', {}, clear=False): - os.environ.pop('MAX_DIFF_LINES', None) - max_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') - try: - max_lines = int(max_lines_str) - except ValueError: - max_lines = 5000 - - assert max_lines == 5000 # Default when not set - - def test_max_diff_lines_zero_forces_agentic_mode(self): - """Test that setting MAX_DIFF_LINES to 0 forces agentic file reading mode.""" - import os - from unittest.mock import patch - - with patch.dict('os.environ', {'MAX_DIFF_LINES': '0'}): - max_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') - max_lines = int(max_lines_str) - - # When max_lines is 0, agentic mode is always used - assert max_lines == 0 - # In the actual code: use_agentic_mode = max_diff_lines == 0 or ... 
+ assert len(sample_diff.splitlines()) == 11 diff --git a/claudecode/test_helper_functions.py b/claudecode/test_helper_functions.py index ad60728..e124ec8 100644 --- a/claudecode/test_helper_functions.py +++ b/claudecode/test_helper_functions.py @@ -9,10 +9,8 @@ get_environment_config, initialize_clients, initialize_findings_filter, - run_code_review, apply_findings_filter, ConfigurationError, - AuditError ) from claudecode.findings_filter import FindingsFilter @@ -145,34 +143,6 @@ def test_initialize_findings_filter_with_defaults(self, mock_simple_filter): assert result == mock_filter_instance - def test_run_code_review_success(self): - """Test successful code review execution.""" - mock_runner = MagicMock() - mock_runner.run_code_review.return_value = ( - True, - "", - {"findings": [{"id": 1}], "analysis_summary": {}} - ) - - result = run_code_review(mock_runner, "test prompt") - - assert result == {"findings": [{"id": 1}], "analysis_summary": {}} - mock_runner.run_code_review.assert_called_once() - - def test_run_code_review_failure(self): - """Test code review execution failure.""" - mock_runner = MagicMock() - mock_runner.run_code_review.return_value = ( - False, - "Audit failed: timeout", - {} - ) - - with pytest.raises(AuditError) as exc_info: - run_code_review(mock_runner, "test prompt") - - assert "Code review failed: Audit failed: timeout" in str(exc_info.value) - def test_apply_findings_filter_with_findings_filter(self): """Test applying FindingsFilter to findings.""" mock_filter = MagicMock(spec=FindingsFilter) diff --git a/claudecode/test_main_function.py b/claudecode/test_main_function.py index f5239d2..7c46ec0 100644 --- a/claudecode/test_main_function.py +++ b/claudecode/test_main_function.py @@ -1,26 +1,24 @@ #!/usr/bin/env python3 -""" -Unit tests for main function and full workflow. 
-""" +"""Unit tests for main function and multi-phase workflow.""" -import pytest import json import os -from unittest.mock import Mock, patch from pathlib import Path +from unittest.mock import Mock, patch + +import pytest from claudecode.github_action_audit import main def _mock_multiphase_success(mock_runner, findings=None): - """Configure runner mock for strict multi-phase orchestration.""" findings = findings or [] mock_runner.run_prompt.side_effect = [ - (True, "", {"skip_review": False, "reason": "", "risk_level": "medium"}), # triage - (True, "", {"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), # context - (True, "", {"findings": findings}), # compliance - (True, "", {"findings": []}), # quality - (True, "", {"findings": []}), # security + (True, "", {"skip_review": False, "reason": "", "risk_level": "medium"}), + (True, "", {"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), + (True, "", {"findings": findings}), + (True, "", {"findings": []}), + (True, "", {"findings": []}), ( True, "", @@ -30,184 +28,23 @@ def _mock_multiphase_success(mock_runner, findings=None): for idx in range(len(findings)) ] }, - ), # validation + ), ] class TestMainFunction: - """Test main function execution flow.""" - def test_main_missing_environment_vars(self, capsys): - """Test main with missing environment variables.""" with patch.dict(os.environ, {}, clear=True): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'error' in output - assert 'GITHUB_REPOSITORY' in output['error'] - - def test_main_missing_pr_number(self, capsys): - """Test main with missing PR number.""" - with patch.dict(os.environ, {'GITHUB_REPOSITORY': 'owner/repo'}, clear=True): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'PR_NUMBER' in output['error'] - - def test_main_invalid_pr_number(self, capsys): - """Test main with invalid PR number.""" - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': 'not-a-number' - }, clear=True): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Invalid PR_NUMBER' in output['error'] - - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_github_client_init_failure(self, mock_client_class, capsys): - """Test main when GitHub client initialization fails.""" - mock_client_class.side_effect = Exception("Token invalid") - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'invalid' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Failed to initialize GitHub client' in output['error'] - assert 'Token invalid' in output['error'] - - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_claude_runner_init_failure(self, mock_client_class, mock_runner_class, capsys): - """Test main when Claude runner initialization fails.""" - mock_client_class.return_value = 
Mock() - mock_runner_class.side_effect = Exception("Runner error") - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Failed to initialize Claude runner' in output['error'] - - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_filter_initialization(self, mock_client_class, mock_runner_class, - mock_full_filter_class): - """Test filter initialization logic.""" - # Setup mocks - mock_client = Mock() - mock_client_class.return_value = mock_client - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (False, "Not available") - mock_runner_class.return_value = mock_runner - - # Test with full filtering enabled - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_CLAUDE_FILTERING': 'true', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ANTHROPIC_API_KEY': 'test-api-key' - }): - with pytest.raises(SystemExit): - main() - - # Should initialize full filter - mock_full_filter_class.assert_called_once() - call_kwargs = mock_full_filter_class.call_args[1] - assert call_kwargs['use_hard_exclusions'] is True - assert call_kwargs['use_claude_filtering'] is True - assert call_kwargs['api_key'] == 'test-api-key' - - # Reset mocks - mock_full_filter_class.reset_mock() - - # Test with filtering disabled - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ENABLE_CLAUDE_FILTERING': 'false' - }): - with pytest.raises(SystemExit): - main() - - # Should use FindingsFilter with hard exclusions only - mock_full_filter_class.assert_called_once() - call_kwargs = mock_full_filter_class.call_args[1] - assert call_kwargs['use_hard_exclusions'] is True - assert call_kwargs['use_claude_filtering'] is False - - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_claude_not_available(self, mock_client_class, mock_runner_class, - mock_filter_class, capsys): - """Test when Claude is not available.""" - mock_client = Mock() - mock_client_class.return_value = mock_client - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (False, "Claude not installed") - mock_runner_class.return_value = mock_runner - - mock_filter_class.return_value = Mock() - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 # Claude not available, exit 1 + assert exc_info.value.code == 2 + output = json.loads(capsys.readouterr().out) + assert "GITHUB_REPOSITORY" in output["error"] - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Claude Code not available' in output['error'] - assert 'Claude not installed' in output['error'] - - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - 
@patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_pr_data_fetch_failure(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, capsys): - """Test when PR data fetch fails.""" + @patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_main_pr_data_fetch_failure(self, mock_client_class, mock_runner_class, capsys): mock_client = Mock() mock_client.get_pr_data.side_effect = Exception("API error") mock_client_class.return_value = mock_client @@ -216,40 +53,23 @@ def test_main_pr_data_fetch_failure(self, mock_client_class, mock_runner_class, mock_runner.validate_claude_available.return_value = (True, "") mock_runner_class.return_value = mock_runner - mock_filter_class.return_value = Mock() - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): + with patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 1 # API error, exit 1 + assert exc_info.value.code == 1 + output = json.loads(capsys.readouterr().out) + assert "Failed to fetch PR data" in output["error"] - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Failed to fetch PR data' in output['error'] - assert 'API error' in output['error'] - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_successful_audit_no_findings(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test successful audit with no findings.""" - # Setup mocks + @patch("pathlib.Path.cwd") + @patch("claudecode.github_action_audit.FindingsFilter") + @patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_main_successful_audit_no_findings(self, mock_client_class, mock_runner_class, mock_filter_class, mock_cwd, capsys): mock_client = Mock() - mock_client.get_pr_data.return_value = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Description' - } + mock_client.get_pr_data.return_value = {"number": 123, "title": "Test PR", "body": "Description"} mock_client.get_pr_diff.return_value = "diff content" + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client mock_runner = Mock() @@ -258,73 +78,39 @@ def test_main_successful_audit_no_findings(self, mock_client_class, mock_runner_ mock_runner_class.return_value = mock_runner mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': [], - 'excluded_findings': [], - 'analysis_summary': {} - }, - Mock() # filter_stats - ) + mock_filter.filter_findings.return_value = (True, {"filtered_findings": [], "excluded_findings": [], "analysis_summary": {}}, Mock()) mock_filter_class.return_value = mock_filter - mock_prompt_func.return_value = "unified review prompt" - mock_cwd.return_value = Path('/tmp/repo') + mock_cwd.return_value = Path("/tmp/repo") - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' 
- }): + with patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 0 # No findings, exit 0 - - captured = capsys.readouterr() - output = json.loads(captured.out) - - assert output['pr_number'] == 123 - assert output['repo'] == 'owner/repo' - assert len(output['findings']) == 0 - assert output['filtering_summary']['total_original_findings'] == 0 - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_successful_audit_with_findings(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test successful audit with high severity findings.""" - # Setup mocks + assert exc_info.value.code == 0 + output = json.loads(capsys.readouterr().out) + assert output["pr_number"] == 123 + assert output["repo"] == "owner/repo" + assert output["findings"] == [] + + @patch("pathlib.Path.cwd") + @patch("claudecode.github_action_audit.FindingsFilter") + @patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_main_successful_audit_with_high_finding(self, mock_client_class, mock_runner_class, mock_filter_class, mock_cwd, capsys): mock_client = Mock() - mock_client.get_pr_data.return_value = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Description' - } + mock_client.get_pr_data.return_value = {"number": 123, "title": "Test PR", "body": "Description"} mock_client.get_pr_diff.return_value = "diff content" - mock_client._is_excluded.return_value = False # Don't exclude any files in tests mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [ { - 'file': 'test.py', - 'line': 10, - 'severity': 'HIGH', - 'category': 'security', - 'description': 'SQL injection' - }, - { - 'file': 'main.py', - 'line': 20, - 'severity': 'MEDIUM', - 'category': 'correctness', - 'description': 'Logic error' + "file": "test.py", + "line": 10, + "severity": "HIGH", + "category": "security", + "description": "SQL injection", } ] @@ -333,206 +119,30 @@ def test_main_successful_audit_with_findings(self, mock_client_class, mock_runne _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner - # Filter keeps only high severity mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': [findings[0]], - 'excluded_findings': [findings[1]], - 'analysis_summary': { - 'total_findings': 2, - 'kept_findings': 1, - 'excluded_findings': 1 - } - }, - Mock() # filter_stats - ) + mock_filter.filter_findings.return_value = (True, {"filtered_findings": findings, "excluded_findings": [], "analysis_summary": {}}, Mock()) mock_filter_class.return_value = mock_filter - mock_prompt_func.return_value = "unified review prompt" - mock_cwd.return_value = Path('/tmp/repo') + mock_cwd.return_value = Path("/tmp/repo") - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): + with patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert 
exc_info.value.code == 1 # High severity finding, exit 1 - - captured = capsys.readouterr() - output = json.loads(captured.out) - - assert len(output['findings']) == 1 - assert output['findings'][0]['severity'] == 'HIGH' - assert output['filtering_summary']['total_original_findings'] == 2 - assert output['filtering_summary']['excluded_findings'] == 1 - assert output['filtering_summary']['kept_findings'] == 1 - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_with_full_filter(self, mock_client_class, mock_runner_class, - mock_prompt_func, mock_cwd, capsys): - """Test main with full FindingsFilter (LLM-based).""" - # Setup mocks - mock_client = Mock() - mock_client.get_pr_data.return_value = { - 'number': 123, - 'title': 'Test PR', - 'body': 'Description' - } - mock_client.get_pr_diff.return_value = "diff content" - mock_client._is_excluded.return_value = False # Don't exclude any files in tests - mock_client.is_excluded.return_value = False - mock_client_class.return_value = mock_client - - findings = [{'file': 'test.py', 'line': 10, 'severity': 'HIGH', 'description': 'Issue'}] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - _mock_multiphase_success(mock_runner, findings=findings) - mock_runner_class.return_value = mock_runner - - mock_prompt_func.return_value = "prompt" - mock_cwd.return_value = Path('/tmp') - - # Mock FindingsFilter to return findings properly - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': findings, - 'excluded_findings': [], - 'analysis_summary': { - 'total_findings': 1, - 'kept_findings': 1, - 'excluded_findings': 0 - } - }, - Mock() # filter_stats - ) - - with patch('claudecode.github_action_audit.FindingsFilter', return_value=mock_filter): - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ENABLE_CLAUDE_FILTERING': 'false', # Use simple filter - }): - with pytest.raises(SystemExit): - main() - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # Check that we got successful results - assert 'findings' in output - assert len(output['findings']) == 1 - assert output['findings'][0]['severity'] == 'HIGH' - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_main_filter_failure_keeps_all_findings(self, mock_client_class, mock_runner_class, - mock_prompt_func, mock_cwd, capsys): - """Test that filter failure keeps all findings with SimpleFindingsFilter.""" - # Setup mocks - mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} - mock_client.get_pr_diff.return_value = "diff" - mock_client._is_excluded.return_value = False # Don't exclude any files in tests - mock_client.is_excluded.return_value = False - mock_client_class.return_value = mock_client - - findings = [ - {'file': 'a.py', 'line': 1, 'severity': 'HIGH', 'description': 'Issue 1'}, - {'file': 'b.py', 'line': 2, 'severity': 'HIGH', 'description': 'Issue 2'} - ] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - 
_mock_multiphase_success(mock_runner, findings=findings) - mock_runner_class.return_value = mock_runner - - mock_prompt_func.return_value = "prompt" - mock_cwd.return_value = Path('/tmp') - - # Mock FindingsFilter to keep all findings - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - { - 'filtered_findings': findings, - 'excluded_findings': [], - 'analysis_summary': { - 'total_findings': 2, - 'kept_findings': 2, - 'excluded_findings': 0 - } - }, - Mock() # filter_stats - ) - - with patch('claudecode.github_action_audit.FindingsFilter', return_value=mock_filter): - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token', - 'ENABLE_HEURISTIC_FILTERING': 'true', - 'ENABLE_CLAUDE_FILTERING': 'false', # Use simple filter - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 # Has HIGH findings - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # All findings should be kept - assert len(output['findings']) == 2 - - def test_main_unexpected_error(self, capsys): - """Test unexpected error handling.""" - with patch('claudecode.github_action_audit.GitHubActionClient') as mock_class: - mock_class.side_effect = Exception("Unexpected error!") - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 2 # EXIT_CONFIGURATION_ERROR - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Unexpected error' in output['error'] + assert exc_info.value.code == 1 + output = json.loads(capsys.readouterr().out) + assert len(output["findings"]) == 1 + assert output["findings"][0]["review_type"] == "security" class TestAuditFailureModes: - """Test various audit failure scenarios.""" - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_audit_failure(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test when code review fails.""" + @patch("pathlib.Path.cwd") + @patch("claudecode.github_action_audit.FindingsFilter") + @patch("claudecode.github_action_audit.SimpleClaudeRunner") + @patch("claudecode.github_action_audit.GitHubActionClient") + def test_audit_failure(self, mock_client_class, mock_runner_class, mock_filter_class, mock_cwd, capsys): mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123} + mock_client.get_pr_data.return_value = {"number": 123} mock_client.get_pr_diff.return_value = "diff" mock_client_class.return_value = mock_client @@ -542,139 +152,13 @@ def test_audit_failure(self, mock_client_class, mock_runner_class, mock_runner_class.return_value = mock_runner mock_filter_class.return_value = Mock() - mock_prompt_func.return_value = "prompt" - mock_cwd.return_value = Path('/tmp') + mock_cwd.return_value = Path("/tmp") - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): + with patch.dict(os.environ, {"GITHUB_REPOSITORY": "owner/repo", "PR_NUMBER": "123", "GITHUB_TOKEN": "test-token"}): with pytest.raises(SystemExit) as exc_info: main() - assert exc_info.value.code == 1 # 
Audit failure, exit 1 - - captured = capsys.readouterr() - output = json.loads(captured.out) - assert 'Code review failed' in output['error'] - assert 'Claude execution failed' in output['error'] - - -class TestReviewTypeMetadata: - """Test that findings are tagged with correct review_type based on category.""" - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_findings_have_correct_review_type(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test that findings get review_type based on category.""" - mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} - mock_client.get_pr_diff.return_value = "diff" - mock_client._is_excluded.return_value = False - mock_client.is_excluded.return_value = False - mock_client_class.return_value = mock_client - - # Mixed findings - security and non-security - findings = [ - {'file': 'test.py', 'line': 1, 'severity': 'MEDIUM', 'category': 'correctness', 'description': 'Logic issue'}, - {'file': 'auth.py', 'line': 5, 'severity': 'HIGH', 'category': 'security', 'description': 'Security issue'} - ] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - _mock_multiphase_success(mock_runner, findings=findings) - mock_runner_class.return_value = mock_runner - - # Filter passes through all findings - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - {'filtered_findings': findings, 'excluded_findings': [], 'analysis_summary': {}}, - Mock() - ) - mock_filter_class.return_value = mock_filter - - mock_prompt_func.return_value = "unified prompt" - mock_cwd.return_value = Path('/tmp') - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 # Has HIGH finding - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # Check that we have findings with correct review_type based on category - assert len(output['findings']) == 2 - - # Find the general finding (correctness category) and verify review_type - general_finding = next(f for f in output['findings'] if f['category'] == 'correctness') - assert general_finding['review_type'] == 'general' - - # Find the security finding and verify review_type - security_finding = next(f for f in output['findings'] if f['category'] == 'security') - assert security_finding['review_type'] == 'security' - - @patch('pathlib.Path.cwd') - @patch('claudecode.github_action_audit.get_unified_review_prompt') - @patch('claudecode.github_action_audit.FindingsFilter') - @patch('claudecode.github_action_audit.SimpleClaudeRunner') - @patch('claudecode.github_action_audit.GitHubActionClient') - def test_review_type_not_overwritten_if_already_set(self, mock_client_class, mock_runner_class, - mock_filter_class, mock_prompt_func, - mock_cwd, capsys): - """Test that review_type is not overwritten if finding already has it.""" - mock_client = Mock() - mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} - mock_client.get_pr_diff.return_value = "diff" - mock_client._is_excluded.return_value = False - mock_client.is_excluded.return_value = 
False - mock_client_class.return_value = mock_client - - # Finding already has review_type set (e.g., from a custom analyzer) - findings = [ - {'file': 'test.py', 'line': 1, 'severity': 'MEDIUM', 'category': 'correctness', - 'description': 'Issue', 'review_type': 'custom'} - ] - - mock_runner = Mock() - mock_runner.validate_claude_available.return_value = (True, "") - _mock_multiphase_success(mock_runner, findings=findings) - mock_runner_class.return_value = mock_runner - - mock_filter = Mock() - mock_filter.filter_findings.return_value = ( - True, - {'filtered_findings': findings, 'excluded_findings': [], 'analysis_summary': {}}, - Mock() - ) - mock_filter_class.return_value = mock_filter - - mock_prompt_func.return_value = "unified prompt" - mock_cwd.return_value = Path('/tmp') - - with patch.dict(os.environ, { - 'GITHUB_REPOSITORY': 'owner/repo', - 'PR_NUMBER': '123', - 'GITHUB_TOKEN': 'test-token' - }): - with pytest.raises(SystemExit): - main() - - captured = capsys.readouterr() - output = json.loads(captured.out) - - # The existing review_type should be preserved (setdefault doesn't overwrite) - assert output['findings'][0]['review_type'] == 'custom' + assert exc_info.value.code == 1 + output = json.loads(capsys.readouterr().out) + assert "Code review failed" in output["error"] + assert "Claude execution failed" in output["error"] diff --git a/claudecode/test_prompts.py b/claudecode/test_prompts.py index 2e9ed52..6a1404b 100644 --- a/claudecode/test_prompts.py +++ b/claudecode/test_prompts.py @@ -1,6 +1,14 @@ -"""Unit tests for the prompts module.""" +"""Unit tests for multi-phase prompt generation.""" -from claudecode.prompts import get_unified_review_prompt +from claudecode.prompts import ( + _build_hybrid_diff_section, + build_compliance_prompt, + build_context_discovery_prompt, + build_quality_prompt, + build_security_prompt, + build_triage_prompt, + build_validation_prompt, +) def _sample_pr_data(): @@ -12,192 +20,64 @@ def _sample_pr_data(): "changed_files": 1, "additions": 10, "deletions": 5, - "head": { - "repo": { - "full_name": "owner/repo" - } - }, - "files": [ - { - "filename": "app.py", - "status": "modified", - "additions": 10, - "deletions": 5, - } - ], + "head": {"repo": {"full_name": "owner/repo"}}, + "files": [{"filename": "app.py"}], } -class TestPrompts: - """Test unified prompt generation.""" - - def test_get_unified_review_prompt_basic(self): - pr_data = _sample_pr_data() - - pr_diff = """ -diff --git a/app.py b/app.py -@@ -1,5 +1,10 @@ - def process_input(user_input): -- return user_input -+ # Process the input -+ result = eval(user_input) -+ return result -""" - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert isinstance(prompt, str) - assert len(prompt) > 0 - assert "123" in prompt - assert "Add new feature" in prompt - assert "testuser" in prompt - assert "app.py" in prompt - assert "eval(user_input)" in prompt - assert "code quality" in prompt.lower() - assert "security" in prompt.lower() - - def test_get_unified_review_prompt_without_diff_uses_file_reading_instructions(self): - pr_data = _sample_pr_data() - - prompt = get_unified_review_prompt(pr_data, pr_diff="diff --git a/a b/a", include_diff=False) - - assert "PR DIFF CONTENT:" not in prompt - assert "IMPORTANT - FILE READING INSTRUCTIONS:" in prompt - - def test_get_unified_review_prompt_no_files(self): - pr_data = _sample_pr_data() - pr_data["changed_files"] = 0 - pr_data["files"] = [] - - prompt = get_unified_review_prompt(pr_data, pr_diff="") - - assert isinstance(prompt, str) - 
assert "Files changed: 0" in prompt - - def test_get_unified_review_prompt_structure(self): - pr_data = _sample_pr_data() - pr_data["title"] = "Test PR" - - pr_diff = "diff --git a/test.py b/test.py\n+print('test')" - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert "CONTEXT:" in prompt - assert "OBJECTIVE:" in prompt - assert "REQUIRED OUTPUT FORMAT:" in prompt - assert pr_diff in prompt - - def test_get_unified_review_prompt_long_diff(self): - pr_data = { - "number": 12345, - "title": "Major refactoring", - "body": "Refactoring the entire codebase", - "user": "refactor-bot", - "changed_files": 10, - "additions": 1000, - "deletions": 500, - "head": { - "repo": { - "full_name": "owner/repo" - } - }, - "files": [ - { - "filename": f"file{i}.py", - "status": "modified", - "additions": 100, - "deletions": 50, - } - for i in range(10) - ], - } - - pr_diff = "\n".join([ - f"diff --git a/file{i}.py b/file{i}.py\n" + - "\n".join([f"+line {j}" for j in range(50)]) - for i in range(10) - ]) - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert isinstance(prompt, str) - assert len(prompt) > 1000 - assert "12345" in prompt - assert "Major refactoring" in prompt - - def test_get_unified_review_prompt_unicode(self): - pr_data = { - "number": 666, - "title": "Add emoji support", - "body": "This PR adds emoji rendering", - "user": "emoji-user", - "changed_files": 1, - "additions": 42, - "deletions": 0, - "head": { - "repo": { - "full_name": "owner/repo" - } - }, - "files": [ - { - "filename": "emoji.py", - "status": "added", - "additions": 42, - "deletions": 0, - } - ], - } - - pr_diff = """ -diff --git a/emoji.py b/emoji.py -+# Security check -+def check_input(text: str) -> bool: -+ return "ALERT" not in text -""" - - prompt = get_unified_review_prompt(pr_data, pr_diff) - - assert "emoji-user" in prompt - assert "emoji.py" in prompt - assert "ALERT" in prompt - - def test_get_unified_review_prompt_custom_instructions(self): - pr_data = _sample_pr_data() - - prompt = get_unified_review_prompt( - pr_data, - pr_diff="diff --git a/app.py b/app.py", - custom_review_instructions="Check transaction consistency.", - custom_security_instructions="Check GraphQL authz.", - ) - - assert "Check transaction consistency." in prompt - assert "Check GraphQL authz." 
in prompt - - def test_get_unified_review_prompt_includes_pr_summary_schema(self): - """Test that prompt includes pr_summary in JSON schema.""" - pr_data = _sample_pr_data() - prompt = get_unified_review_prompt(pr_data, pr_diff="diff") - - assert '"pr_summary"' in prompt - assert '"overview"' in prompt - assert '"file_changes"' in prompt - assert '"label"' in prompt - assert '"files"' in prompt - - def test_get_unified_review_prompt_includes_summary_guidelines(self): - """Test that prompt includes PR summary guidelines.""" - pr_data = _sample_pr_data() - prompt = get_unified_review_prompt(pr_data, pr_diff="diff") - - assert "PR SUMMARY GUIDELINES:" in prompt - assert "2-4 sentences" in prompt - assert "~10 words" in prompt - - def test_build_hybrid_diff_section_max_lines_zero_omits_inline_diff(self): - from claudecode.prompts import _build_hybrid_diff_section - - section = _build_hybrid_diff_section("diff --git a/a.py b/a.py\n+print('x')", 0) - - assert "intentionally omitted" in section - assert "```diff" not in section +def test_build_hybrid_diff_section_max_lines_zero_omits_inline_diff(): + section = _build_hybrid_diff_section("diff --git a/a.py b/a.py\n+print('x')", 0) + + assert "intentionally omitted" in section + assert "```diff" not in section + + +def test_build_hybrid_diff_section_truncates_when_over_limit(): + diff = "\n".join([f"+line {i}" for i in range(20)]) + + section = _build_hybrid_diff_section(diff, 5) + + assert "TRUNCATED" in section + assert "5 lines out of 20" in section + + +def test_triage_prompt_contains_required_schema(): + prompt = build_triage_prompt(_sample_pr_data(), "diff --git a/app.py b/app.py", 100) + + assert '"skip_review"' in prompt + assert '"risk_level"' in prompt + + +def test_context_prompt_contains_discovery_fields(): + prompt = build_context_discovery_prompt(_sample_pr_data(), "diff --git a/app.py b/app.py", 100) + + assert '"claude_md_files"' in prompt + assert '"priority_files"' in prompt + + +def test_specialist_prompts_include_findings_schema_and_custom_instructions(): + pr_data = _sample_pr_data() + context = {"hotspots": ["app.py"]} + + compliance = build_compliance_prompt(pr_data, "diff", 100, context) + quality = build_quality_prompt(pr_data, "diff", 100, context, custom_review_instructions="Check tx safety") + security = build_security_prompt(pr_data, "diff", 100, context, custom_security_instructions="Check SSRF") + + for prompt in [compliance, quality, security]: + assert '"findings"' in prompt + assert '"confidence"' in prompt + + assert "rule_reference" in compliance + assert "Check tx safety" in quality + assert "exploit_preconditions" in security + assert "Check SSRF" in security + + +def test_validation_prompt_contains_candidate_findings(): + findings = [{"file": "app.py", "line": 10, "severity": "HIGH"}] + + prompt = build_validation_prompt(_sample_pr_data(), "diff", 100, findings) + + assert "CANDIDATE FINDINGS" in prompt + assert '"validated_findings"' in prompt + assert '"finding_index"' in prompt diff --git a/claudecode/test_workflow_integration.py b/claudecode/test_workflow_integration.py index 325d062..fa43290 100644 --- a/claudecode/test_workflow_integration.py +++ b/claudecode/test_workflow_integration.py @@ -288,8 +288,8 @@ def test_full_workflow_with_real_pr_structure(self, mock_get, mock_run): assert exc_info.value.code == 1 # Verify API calls - # 5 calls: PR data, files, diff, comments, reactions for bot comment - assert mock_get.call_count == 5 + # 3 calls: PR data, files, diff + assert mock_get.call_count == 3 
assert mock_run.call_count == 7 # 1 version check + 6 phase calls # Verify the audit was run with proper prompt diff --git a/scripts/determine-claudecode-enablement.sh b/scripts/determine-claudecode-enablement.sh index a71ffc2..5a9865d 100755 --- a/scripts/determine-claudecode-enablement.sh +++ b/scripts/determine-claudecode-enablement.sh @@ -12,7 +12,6 @@ # - TRIGGER_ON_COMMIT: Whether to run on new commits (true/false) # - TRIGGER_ON_REVIEW_REQUEST: Whether to run on review requests (true/false) # - TRIGGER_ON_MENTION: Whether to run on bot mentions (true/false) -# - RUN_EVERY_COMMIT: Legacy flag for running on commits (true/false) # - SKIP_DRAFT_PRS: Whether to skip draft PRs (true/false) # - IS_DRAFT: Whether the PR is a draft (true/false) # - REQUIRE_LABEL: Required label name (optional) @@ -54,9 +53,8 @@ if [ "$ENABLE_CLAUDECODE" == "true" ]; then fi ;; commit) - # Check both new and legacy input names (run-every-commit is alias for trigger-on-commit) - if [ "${TRIGGER_ON_COMMIT:-false}" != "true" ] && [ "${RUN_EVERY_COMMIT:-false}" != "true" ]; then - echo "Trigger 'commit' is disabled via trigger-on-commit/run-every-commit input" + if [ "${TRIGGER_ON_COMMIT:-false}" != "true" ]; then + echo "Trigger 'commit' is disabled via trigger-on-commit input" ENABLE_CLAUDECODE="false" fi ;; @@ -121,4 +119,4 @@ if [ "$ENABLE_CLAUDECODE" == "true" ]; then echo "ClaudeCode is enabled for this run (trigger: $TRIGGER_TYPE, PR: #$PR_NUMBER, SHA: $GITHUB_SHA)" else echo "ClaudeCode is disabled for this run" -fi \ No newline at end of file +fi diff --git a/scripts/test-determine-claudecode-enablement.sh b/scripts/test-determine-claudecode-enablement.sh index 404e56c..7bc06e2 100755 --- a/scripts/test-determine-claudecode-enablement.sh +++ b/scripts/test-determine-claudecode-enablement.sh @@ -35,7 +35,7 @@ teardown_test() { # Unset all environment variables that tests might set unset GITHUB_EVENT_NAME PR_NUMBER GITHUB_SHA TRIGGER_TYPE unset TRIGGER_ON_OPEN TRIGGER_ON_COMMIT TRIGGER_ON_REVIEW_REQUEST TRIGGER_ON_MENTION - unset RUN_EVERY_COMMIT SKIP_DRAFT_PRS IS_DRAFT REQUIRE_LABEL PR_LABELS IS_PR + unset SKIP_DRAFT_PRS IS_DRAFT REQUIRE_LABEL PR_LABELS IS_PR } # Helper to run the script and capture output @@ -143,24 +143,6 @@ test_commit_trigger_with_new_input() { teardown_test } -test_commit_trigger_with_legacy_input() { - echo "Test: Commit trigger with run-every-commit (legacy) enabled" - setup_test - - export GITHUB_EVENT_NAME="pull_request" - export PR_NUMBER="123" - export GITHUB_SHA="abc123" - export TRIGGER_TYPE="commit" - export TRIGGER_ON_COMMIT="false" - export RUN_EVERY_COMMIT="true" - - run_script - - assert_equals "true" "$ENABLE_CLAUDECODE" "Should enable for legacy run-every-commit" - - teardown_test -} - test_commit_trigger_disabled() { echo "Test: Commit trigger disabled" setup_test @@ -170,7 +152,6 @@ test_commit_trigger_disabled() { export GITHUB_SHA="abc123" export TRIGGER_TYPE="commit" export TRIGGER_ON_COMMIT="false" - export RUN_EVERY_COMMIT="false" run_script @@ -453,7 +434,6 @@ test_unsupported_event_type test_pull_request_event_open_trigger_enabled test_pull_request_event_open_trigger_disabled test_commit_trigger_with_new_input -test_commit_trigger_with_legacy_input test_commit_trigger_disabled test_review_request_trigger test_mention_trigger @@ -484,4 +464,4 @@ if [ $TESTS_FAILED -eq 0 ]; then else echo -e "${RED}Some tests failed!${NC}" exit 1 -fi \ No newline at end of file +fi
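
For orientation, a minimal sketch of the run_prompt phase ordering that _mock_multiphase_success stubs out (the "1 version check + 6 phase calls" counted in test_workflow_integration.py). The payload shapes are copied from the mocked side effects above; PHASE_PAYLOADS and the tuple layout are illustrative names only, not identifiers in the codebase.

    # Sketch only: phase order and payload shapes mirrored from _mock_multiphase_success.
    # Each run_prompt call in the tests returns (ok, error, payload); only payloads are shown here.
    PHASE_PAYLOADS = [
        ("triage", {"skip_review": False, "reason": "", "risk_level": "medium"}),
        ("context", {"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}),
        ("compliance", {"findings": []}),
        ("quality", {"findings": []}),
        ("security", {"findings": []}),
        ("validation", {"validated_findings": []}),
    ]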