diff --git a/Dockerfile b/Dockerfile index ef979a37c..6e6d24419 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # 使用官方的 Python 基础镜像 -FROM python:3.10-slim AS base +FROM python:3.14.0rc1-slim AS base # 设置工作目录 WORKDIR /app diff --git a/RAG_UPGRADE_README.md b/RAG_UPGRADE_README.md new file mode 100644 index 000000000..9f8172926 --- /dev/null +++ b/RAG_UPGRADE_README.md @@ -0,0 +1,478 @@ +# 🚀 RAG增强代码审查系统升级指南 + +## 📋 升级概述 + +现有的AI代码审查系统添加了RAG(Retrieval-Augmented Generation)功能。升级后的系统具备以下新特性: + +### 🆕 新增功能 + +1. **智能知识库管理** + + - 支持上传自定义技术文档(PDF、Word、Markdown、代码文件等) + - 内置7种编程语言的最佳实践文档(HTML、CSS、JavaScript、Java、Python、C++、Go) + - 文档自动分割和向量化存储 + - 基于ChromaDB的持久化向量存储 +2. **智能语言检测与检索** + + - 自动检测代码语言特征(Python、JavaScript、Java、Go、C++、HTML、CSS) + - 基于代码内容进行语义相似性搜索 + - 智能匹配相关技术文档和最佳实践 + - 支持相似度阈值过滤和结果数量限制 +3. **RAG增强审查** + + - 结合检索到的知识文档进行代码审查 + - 提供基于最佳实践的具体建议 + - 动态生成上下文感知的审查提示词 + - 支持多种审查风格(专业、讽刺、温和、幽默) +4. **可视化管理界面** + + - 知识库状态监控和文档统计 + - 文档上传、管理和搜索 + - RAG功能测试和批量审查 + - 结果对比和报告导出 +5. **高级功能** + + - 支持温度参数调节(0-2) + - 相似度阈值配置(0-1) + - 批量代码审查 + - 审查结果评分系统 + +## 🔧 部署步骤 + +### 1. 安装依赖 + +```bash +# 安装新增的RAG相关依赖 +pip install -r requirements.txt +``` + +**核心RAG依赖包**: + +- `chromadb==0.4.15` - 向量数据库 +- `sentence-transformers==2.2.2` - 文本向量化 +- `langchain==0.1.0` - 语言链框架 +- `PyPDF2==3.0.1` - PDF文档处理 +- `python-docx==0.8.11` - Word文档处理 +- `markdown==3.5.1` - Markdown处理 +- `beautifulsoup4==4.12.2` - HTML解析 +- `faiss-cpu==1.7.4` - 向量检索 + +### 2. 配置环境变量 + +在 `conf/.env` 文件中添加以下RAG相关配置: + +```bash +# RAG功能配置 +ENABLE_RAG=1 +# 1表示启用RAG,0表示使用原有审查方式 + +# 知识库配置 +KNOWLEDGE_BASE_PATH=data/knowledge_base +CHUNK_SIZE=1000 +CHUNK_OVERLAP=200 +SEARCH_RESULTS_LIMIT=5 + +# 相似度配置 +RAG_SIMILARITY_THRESHOLD=0.2 + +# 模型配置 +AUTO_INIT_BUILTIN_KNOWLEDGE=1 + +# 其他配置保持不变... +LLM_PROVIDER=deepseek +DEEPSEEK_API_KEY=your_api_key_here +GITLAB_ACCESS_TOKEN=your_token_here +``` + +### 3. 
启动升级后的服务 + +#### 方式一:Docker部署(推荐) + +```bash +# 使用现有的docker-compose,会自动支持RAG功能 +docker-compose up -d +``` + +#### 方式二:本地部署 + +```bash +# 启动API服务(包含RAG功能) +python api.py + +# 启动原有Dashboard +streamlit run ui.py --server.port=5002 --server.address=0.0.0.0 + +# 启动RAG管理界面 +streamlit run rag_dashboard.py --server.port=5003 --server.address=0.0.0.0 +``` + +### 4. 验证部署 + +- API服务:http://localhost:5001 +- 原Dashboard:http://localhost:5002 +- RAG管理界面:http://localhost:5003 + +## 🎯 使用指南 + +### RAG管理界面功能 + +#### 1. 状态总览 📊 + +- 查看RAG功能启用状态 +- 监控知识库文档统计(自定义文档、内置文档) +- 检查系统配置和知识库路径 + +#### 2. 文档管理 📝 + +- 查看所有已上传的文档(自定义和内置) +- 管理自定义文档(删除、搜索) +- 恢复和重新加载内置文档 + +#### 3. 文档搜索 🔍 + +- 测试语义搜索功能 +- 支持按来源过滤(自定义、内置、全部) +- 查看检索结果和相似度分数 +- 验证知识库内容质量 + +#### 4. 上传文档 📁 + +- 支持多种文件格式:PDF、Word(docx)、Markdown、代码文件 +- 添加标题和标签便于分类 +- 自动处理和向量化存储 + +#### 5. RAG测试 🧪 + +- 输入代码片段测试审查功能 +- 支持预设示例代码(HTML、CSS、JavaScript、Java、Python、C++、Go) +- 可调节温度参数(0-2)和相似度阈值(0-1) +- 查看检索到的相关文档 +- 对比RAG前后的审查效果 + +#### 6. 批量审查 📦 + +- 支持多个代码文件批量审查 +- 可选择RAG测试或RAG/普通对比模式 +- 生成批量审查报告 + +### API接口 + +新增的知识库管理API: + +```bash +# 获取知识库状态 +GET /api/knowledge/status +Response: { + "rag_enabled": true, + "total_documents": 15, + "custom_documents": 3, + "builtin_documents": 12, + "knowledge_base_path": "data/knowledge_base" +} + +# 上传文档 +POST /api/knowledge/upload +Content-Type: multipart/form-data +Body: { + "file": , + "title": "文档标题", + "tags": "tag1,tag2,tag3" +} + +# 列出文档 +GET /api/knowledge/documents +Response: { + "documents": [ + { + "id": "doc_id", + "title": "文档标题", + "tags": ["tag1", "tag2"], + "source": "custom", + "created_at": "2024-01-01T00:00:00Z" + } + ], + "total": 15 +} + +# 删除文档 +DELETE /api/knowledge/documents/{doc_id}?source=custom + +# 搜索文档 +POST /api/knowledge/search +Body: { + "query": "搜索关键词", + "n_results": 5, + "source": "all", + "similarity_threshold": 0.2 +} + +# 测试RAG +POST /api/knowledge/test_rag +Body: { + "code": "代码内容", + "commit_message": "提交信息", + "temperature": 0.3, + "similarity_threshold": 0.2 
+} +Response: { + "code": "代码内容", + "commit_message": "提交信息", + "similarity_threshold": 0.2, + "temperature": 0.3, + "relevant_docs": "相关文档内容", + "review_result": "审查结果", + "score": 85 +} + +# 对比RAG和普通审查 +POST /api/knowledge/compare_rag +Body: { + "code": "代码内容", + "commit_message": "提交信息", + "temperature": 0.3, + "similarity_threshold": 0.2 +} +Response: { + "rag_result": { + "relevant_docs": "相关文档", + "review_result": "RAG审查结果", + "score": 85 + }, + "normal_result": { + "review_result": "普通审查结果", + "score": 75 + } +} +``` + +## 🔄 工作流程对比 + +### 原有流程 + +``` +Webhook触发 → 获取代码变更 → 调用LLM → 返回审查结果 +``` + +### 升级后RAG流程 + +``` +Webhook触发 → 获取代码变更 → 语言特征检测 → 知识库检索 → RAG增强审查 → 返回基于最佳实践的审查结果 +``` + +### 详细RAG流程 + +1. **代码接收**: 接收代码变更和提交信息 +2. **语言检测**: 自动检测代码语言(Python、JavaScript、Java、Go、C++、HTML、CSS) +3. **特征提取**: 提取代码中的关键字、库引用等特征 +4. **知识检索**: 在知识库中检索相关技术文档 +5. **相似度过滤**: 根据阈值过滤相关文档 +6. **RAG审查**: 结合检索到的文档进行代码审查 +7. **结果生成**: 生成基于最佳实践的审查报告 + +## 📚 内置知识库 + +系统预置了以下编程语言规范文档: + +1. **HTML编码规范** (`html_standards.md`) + + - 语义化标签使用指南 + - 可访问性最佳实践 + - SEO优化建议 +2. **CSS编码规范** (`css_standards.md`) + + - BEM命名规范 + - 响应式设计原则 + - 性能优化指南 +3. **JavaScript编码规范** (`javascript_standards.md`) + + - ES6+特性使用建议 + - 函数式编程最佳实践 + - 异步编程规范 +4. **Java编码规范** (`java_standards.md`) + + - SOLID原则应用 + - 并发编程最佳实践 + - 性能优化指南 +5. **Python编码规范** (`python_standards.md`) + + - PEP 8代码风格 + - 类型注解使用 + - 最佳实践建议 +6. **C++编码规范** (`cpp_standards.md`) + + - 内存管理准则 + - RAII原则应用 + - 性能优化指南 +7. **Go编码规范** (`go_standards.md`) + + - 并发安全最佳实践 + - 错误处理规范 + - 接口设计原则 + +## 🎨 自定义知识库 + +### 添加企业规范文档 + +1. 准备文档(支持TXT、PDF、Word(docx)、Markdown格式) +2. 通过RAG管理界面上传 +3. 添加相关标签便于检索 +4. 
系统自动处理和向量化 + +### 文档组织建议 + +- **按技术栈分类**:React、Vue、Java、Python等 +- **按类型分类**:编码规范、最佳实践、架构设计等 +- **按团队分类**:前端团队、后端团队、移动端团队等 + +### 支持的文档格式 + +- **PDF文档**: 自动提取文本内容 +- **Word文档**: 支持.docx格式 +- **Markdown**: 保持格式结构 +- **代码文件**: 支持.py、.js、.java、.cpp、.go等 +- **文本文件**: 纯文本格式 + +## ⚙️ 高级配置 + +### 知识库配置 + +```bash +# 知识库路径 +KNOWLEDGE_BASE_PATH=data/knowledge_base + +# 文本分割参数 +CHUNK_SIZE=1000 # 分块大小 +CHUNK_OVERLAP=200 # 重叠大小 + +# 检索参数 +SEARCH_RESULTS_LIMIT=5 # 检索结果数量 + +# RAG功能开关 +ENABLE_RAG=1 # 1启用,0禁用 + +# 相似度阈值 +RAG_SIMILARITY_THRESHOLD=0.2 + +# 向量模型配置 +MODEL_PATH=model/all-MiniLM-L6-v2 # 本地模型路径 +``` + +### 审查风格配置 + +```bash +REVIEW_STYLE=professional # professional, sarcastic, gentle, humorous +``` + +### 温度参数说明 + +- **0.0-0.3**: 保守、一致的输出 +- **0.3-0.7**: 平衡的创造性和一致性 +- **0.7-1.0**: 更创造性的输出 +- **1.0-2.0**: 高度创造性的输出 + +## 🐛 故障排除 + +### 常见问题 + +1. **RAG功能不生效** + + - 检查 `ENABLE_RAG=1` 配置 + - 确认依赖包安装完整 + - 查看日志确认知识库初始化状态 +2. **知识库为空** + + - 系统首次启动会自动初始化内置文档 + - 检查 `data/knowledge_base` 目录权限 + - 查看启动日志确认初始化过程 +3. **文档上传失败** + + - 检查文件格式是否支持 + - 确认 `data/uploads` 目录可写 + - 查看具体错误信息 +4. **搜索结果为空** + + - 确认知识库中有相关文档 + - 尝试不同的搜索关键词 + - 检查向量化是否正常 +5. **模型加载失败** + + - 检查 `model/all-MiniLM-L6-v2/` 目录 + - 确认模型文件完整性 + - 尝试重新下载模型 + +### 日志调试 + +```bash +# 查看详细日志 +tail -f log/app.log +``` + +## 📈 性能优化 + +1. **向量模型选择** + + - 默认使用 `all-MiniLM-L6-v2`(轻量级,384维向量) + - 支持本地模型和在线模型自动切换 + - 可升级到更大的模型获得更好效果 +2. **文本分块优化** + + - 智能分块策略,支持在句子边界分割 + - 默认块大小1000字符,重叠200字符 + - 自动处理中英文混合文本 +3. **检索优化** + + - 使用ChromaDB持久化存储 + - 支持标签过滤和元数据检索 + - 结果按相似度排序 + - 支持自定义检索数量限制 +4. 
**语言检测优化** + + - 基于关键字和库引用的智能语言检测 + - 支持7种主流编程语言 + - 提高检索精度和效率 + +## 🔄 版本兼容性 + +- **向后兼容**:原有功能完全保留 +- **可选启用**:通过 `ENABLE_RAG` 控制 +- **平滑升级**:无需修改现有配置 +- **渐进式部署**:可以逐步启用RAG功能 + +## 📊 功能对比 + +| 功能特性 | 原有系统 | RAG增强系统 | +| -------- | --------------- | --------------------- | +| 代码审查 | ✅ 基础审查 | ✅ 基于最佳实践的审查 | +| 知识库 | ❌ 无 | ✅ 智能知识库 | +| 语言检测 | ❌ 无 | ✅ 自动语言检测 | +| 文档检索 | ❌ 无 | ✅ 语义相似性检索 | +| 审查风格 | ✅ 多种风格可选 | ✅ 多种风格可选 | +| 温度调节 | ❌ 无 | ✅ 可调节创造性 | +| 批量审查 | ❌ 无 | ✅ 支持批量处理 | +| 结果对比 | ❌ 无 | ✅ RAG vs 普通对比 | + +## 🔮 未来扩展 + +1. **更多语言支持** + + - 扩展语言检测能力 + - 添加更多编程语言规范 +2. **高级检索功能** + + - 混合检索策略 + - 语义过滤功能 + - 多模态检索 +3. **个性化定制** + + - 团队专属知识库 + - 个性化审查风格 + - 自定义评分标准 +4. **集成增强** + + - IDE插件支持 + - CI/CD集成 + - 团队协作功能 + +--- + +*本文档描述了RAG代码审查系统的完整功能和使用方法,帮助用户快速上手和深度使用该系统。* diff --git a/biz/api/knowledge_api.py b/biz/api/knowledge_api.py new file mode 100644 index 000000000..84918f9ed --- /dev/null +++ b/biz/api/knowledge_api.py @@ -0,0 +1,293 @@ +import os +import traceback +from flask import Blueprint, request, jsonify +from werkzeug.utils import secure_filename + +from biz.utils.rag_code_reviewer import RAGCodeReviewer +from biz.utils.code_reviewer import CodeReviewer +from biz.utils.log import logger + +knowledge_bp = Blueprint('knowledge', __name__) + +# 允许的文件扩展名 +ALLOWED_EXTENSIONS = {'txt', 'md'} + +def allowed_file(filename): + return '.' 
in filename and \ + filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + +@knowledge_bp.route('/upload', methods=['POST']) +def upload_document(): + """上传知识文档""" + try: + if 'file' not in request.files: + return jsonify({'error': '没有文件'}), 400 + + file = request.files['file'] + if file.filename == '': + return jsonify({'error': '没有选择文件'}), 400 + + if not allowed_file(file.filename): + return jsonify({'error': f'不支持的文件类型。请上传 .txt 或 .md 格式的文档文件。'}), 400 + + # 获取标题和标签 + title = request.form.get('title', file.filename) + tags = request.form.get('tags', '').split(',') + tags = [tag.strip() for tag in tags if tag.strip()] + + # 保存文件 + filename = secure_filename(file.filename) + upload_folder = 'data/uploads' + os.makedirs(upload_folder, exist_ok=True) + file_path = os.path.join(upload_folder, filename) + file.save(file_path) + + # 添加到知识库 + reviewer = RAGCodeReviewer() + doc_id = reviewer.add_knowledge_document(title, file_path, tags) + + # 删除临时文件 + os.remove(file_path) + + return jsonify({ + 'message': '文档上传成功', + 'doc_id': doc_id, + 'title': title, + 'tags': tags + }) + + except Exception as e: + logger.error(f"上传文档失败: {e}") + logger.error(traceback.format_exc()) + return jsonify({'error': f'上传失败: {str(e)}'}), 500 + +@knowledge_bp.route('/documents', methods=['GET']) +def list_documents(): + """列出所有知识文档""" + try: + reviewer = RAGCodeReviewer() + documents = reviewer.list_knowledge_documents() + + return jsonify({ + 'documents': documents, + 'total': len(documents) + }) + + except Exception as e: + logger.error(f"获取文档列表失败: {e}") + return jsonify({'error': f'获取失败: {str(e)}'}), 500 + +@knowledge_bp.route('/documents/', methods=['DELETE']) +def delete_document(doc_id): + """删除知识文档""" + try: + source = request.args.get('source', 'custom') # 获取source参数 + reviewer = RAGCodeReviewer() + reviewer.delete_knowledge_document(doc_id, source) + + return jsonify({'message': f'文档 {doc_id} 已删除'}) + + except Exception as e: + logger.error(f"删除文档失败: {e}") + return jsonify({'error': 
f'删除失败: {str(e)}'}), 500 + +@knowledge_bp.route('/documents/restore', methods=['POST']) +def restore_builtin_documents(): + """恢复所有内置文档""" + try: + reviewer = RAGCodeReviewer() + reviewer.restore_builtin_documents() + + return jsonify({'message': '内置文档已恢复'}) + + except Exception as e: + logger.error(f"恢复内置文档失败: {e}") + return jsonify({'error': f'恢复失败: {str(e)}'}), 500 + +@knowledge_bp.route('/documents/reload', methods=['POST']) +def reload_builtin_documents(): + """重新加载内置文档(清除后重新添加)""" + try: + reviewer = RAGCodeReviewer() + + # 清除内置文档集合 + reviewer.knowledge_base.clear_builtin_collection() + + # 重新初始化内置文档 + reviewer.knowledge_base._init_builtin_knowledge() + + return jsonify({'message': '内置文档已重新加载'}) + + except Exception as e: + logger.error(f"重新加载内置文档失败: {e}") + return jsonify({'error': f'重新加载失败: {str(e)}'}), 500 + +@knowledge_bp.route('/search', methods=['POST']) +def search_documents(): + """搜索相关文档""" + try: + data = request.get_json() + if not data or 'query' not in data: + return jsonify({'error': '缺少查询参数'}), 400 + + query = data['query'] + n_results = data.get('n_results', 5) + source = data.get('source', 'all') # all, custom, builtin + similarity_threshold = float(data.get('similarity_threshold', 0.0)) # 新增相似度阈值参数 + + # 验证相似度阈值范围 + if not 0 <= similarity_threshold <= 1: + return jsonify({'error': '相似度阈值必须在0到1之间'}), 400 + + reviewer = RAGCodeReviewer() + results = reviewer.knowledge_base.search_relevant_documents( + query, n_results, source, similarity_threshold + ) + + # 确保所有返回的结果都满足相似度阈值要求 + filtered_results = [r for r in results if r['score'] >= similarity_threshold] + + return jsonify({ + 'query': query, + 'results': filtered_results, + 'total': len(filtered_results), + 'similarity_threshold': similarity_threshold + }) + + except ValueError as e: + return jsonify({'error': f'参数错误: {str(e)}'}), 400 + except Exception as e: + logger.error(f"搜索文档失败: {e}") + return jsonify({'error': f'搜索失败: {str(e)}'}), 500 + +@knowledge_bp.route('/test_rag', 
methods=['POST']) +def test_rag(): + """测试RAG功能""" + try: + data = request.get_json() + if not data or 'code' not in data: + return jsonify({'error': '缺少代码参数'}), 400 + + code = data['code'] + commit_message = data.get('commit_message', '') + similarity_threshold = float(data.get('similarity_threshold', 0.2)) # 新增相似度阈值参数 + temperature = float(data.get('temperature', 0.3)) # 新增温度参数 + + # 验证相似度阈值范围 + if not 0 <= similarity_threshold <= 1: + return jsonify({'error': '相似度阈值必须在0到1之间'}), 400 + + # 验证温度范围 + if not 0 <= temperature <= 2: + return jsonify({'error': '温度值必须在0到2之间'}), 400 + + reviewer = RAGCodeReviewer() + + # 获取相关知识 + relevant_docs = reviewer.get_relevant_knowledge(code, similarity_threshold) + + # 进行审查 + review_result = reviewer.review_and_strip_code(code, commit_message, similarity_threshold, temperature) + score = reviewer.parse_review_score(review_result) + + return jsonify({ + 'code': code, + 'commit_message': commit_message, + 'similarity_threshold': similarity_threshold, + 'temperature': temperature, + 'relevant_docs': relevant_docs, + 'review_result': review_result, + 'score': score + }) + + except Exception as e: + logger.error(f"RAG测试失败: {e}") + return jsonify({'error': f'测试失败: {str(e)}'}), 500 + +@knowledge_bp.route('/status', methods=['GET']) +def get_status(): + """获取知识库状态""" + try: + reviewer = RAGCodeReviewer() + documents = reviewer.list_knowledge_documents() + + custom_docs = [doc for doc in documents if doc['source'] == 'custom'] + builtin_docs = [doc for doc in documents if doc['source'] == 'builtin'] + + return jsonify({ + 'rag_enabled': reviewer.enable_rag, + 'total_documents': len(documents), + 'custom_documents': len(custom_docs), + 'builtin_documents': len(builtin_docs), + 'knowledge_base_path': reviewer.knowledge_base.db_path + }) + + except Exception as e: + logger.error(f"获取状态失败: {e}") + return jsonify({'error': f'获取状态失败: {str(e)}'}), 500 + +@knowledge_bp.route('/compare_rag', methods=['POST']) +def compare_rag(): + 
"""对比测试RAG和非RAG的代码审查结果""" + try: + data = request.get_json() + if not data or 'code' not in data: + return jsonify({'error': '缺少代码参数'}), 400 + + code = data['code'] + commit_message = data.get('commit_message', '') + similarity_threshold = float(data.get('similarity_threshold', 0.2)) # 新增相似度阈值参数 + temperature = float(data.get('temperature', 0.3)) # 新增温度参数 + + # 验证相似度阈值范围 + if not 0 <= similarity_threshold <= 1: + return jsonify({'error': '相似度阈值必须在0到1之间'}), 400 + + # 验证温度范围 + if not 0 <= temperature <= 2: + return jsonify({'error': '温度值必须在0到2之间'}), 400 + + # 1. 使用RAG进行审查 + rag_reviewer = RAGCodeReviewer() + + # 获取相关知识 + relevant_docs = rag_reviewer.get_relevant_knowledge(code, similarity_threshold) + + # RAG审查 + rag_review_result = rag_reviewer.review_and_strip_code(code, commit_message, similarity_threshold, temperature) + rag_score = rag_reviewer.parse_review_score(rag_review_result) + + # 2. 使用普通模型进行审查(不使用RAG) + normal_reviewer = CodeReviewer() + normal_review_result = normal_reviewer.review_and_strip_code(code, commit_message, temperature) + normal_score = normal_reviewer.parse_review_score(normal_review_result) + + # 计算实际显示的文档数量 + docs = relevant_docs.split('###') if relevant_docs else [] + actual_docs = [doc for doc in docs if doc.strip()] + + return jsonify({ + 'code': code, + 'commit_message': commit_message, + 'similarity_threshold': similarity_threshold, + 'temperature': temperature, + 'rag_result': { + 'relevant_docs': relevant_docs, + 'review_result': rag_review_result, + 'score': rag_score + }, + 'normal_result': { + 'review_result': normal_review_result, + 'score': normal_score + }, + 'comparison': { + 'score_difference': rag_score - normal_score, + 'has_relevant_docs': bool(relevant_docs.strip()), + 'unique_docs_count': len(actual_docs), # 使用实际显示的文档数 + 'chunks_count': len(actual_docs) # 保持一致性 + } + }) + + except Exception as e: + logger.error(f"对比测试失败: {e}") + return jsonify({'error': f'对比测试失败: {str(e)}'}), 500 diff --git a/biz/llm/client/base.py 
b/biz/llm/client/base.py index b83c36e92..e2978a412 100644 --- a/biz/llm/client/base.py +++ b/biz/llm/client/base.py @@ -1,5 +1,6 @@ from abc import abstractmethod from typing import List, Dict, Optional +import os from biz.llm.types import NotGiven, NOT_GIVEN from biz.utils.log import logger @@ -8,6 +9,10 @@ class BaseClient: """ Base class for chat models client. """ + def __init__(self): + # 从环境变量获取默认温度设置 + self.default_temperature = float(os.getenv("LLM_TEMPERATURE", "0.3")) + def ping(self) -> bool: """Ping the model to check connectivity.""" try: @@ -21,6 +26,12 @@ def ping(self) -> bool: def completions(self, messages: List[Dict[str, str]], model: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, ) -> str: """Chat with the model. + + Args: + messages: List of message dictionaries with 'role' and 'content' + model: Model name to use + temperature: Controls randomness in the response (0.0 to 2.0) """ diff --git a/biz/llm/client/deepseek.py b/biz/llm/client/deepseek.py index 9cd63b5d7..43efd7d4e 100644 --- a/biz/llm/client/deepseek.py +++ b/biz/llm/client/deepseek.py @@ -10,6 +10,7 @@ class DeepSeekClient(BaseClient): def __init__(self, api_key: str = None): + super().__init__() # 调用父类初始化 self.api_key = api_key or os.getenv("DEEPSEEK_API_KEY") self.base_url = os.getenv("DEEPSEEK_API_BASE_URL", "https://api.deepseek.com") if not self.api_key: @@ -21,14 +22,25 @@ def __init__(self, api_key: str = None): def completions(self, messages: List[Dict[str, str]], model: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, ) -> str: try: model = model or self.default_model - logger.debug(f"Sending request to DeepSeek API. 
Model: {model}, Messages: {messages}") + temperature = temperature if temperature is not NOT_GIVEN else self.default_temperature + + # 处理None值,使用默认温度 + if temperature is None: + temperature = self.default_temperature + + # 确保温度值在有效范围内 + temperature = max(0.0, min(2.0, temperature)) + + logger.debug(f"Sending request to DeepSeek API. Model: {model}, Temperature: {temperature}, Messages: {messages}") completion = self.client.chat.completions.create( model=model, - messages=messages + messages=messages, + temperature=temperature ) if not completion or not completion.choices: diff --git a/biz/llm/client/ollama_client.py b/biz/llm/client/ollama_client.py index 1574f9ac5..a70374dd3 100644 --- a/biz/llm/client/ollama_client.py +++ b/biz/llm/client/ollama_client.py @@ -11,6 +11,7 @@ class OllamaClient(BaseClient): def __init__(self, api_key: str = None): + super().__init__() # 调用父类初始化 self.default_model = self.default_model = os.getenv("OLLAMA_API_MODEL", "deepseek-r1-8k:14b") self.base_url = os.getenv("OLLAMA_API_BASE_URL", "http://127.0.0.1:11434") self.client = Client( @@ -39,7 +40,22 @@ def _extract_content(self, content: str) -> str: def completions(self, messages: List[Dict[str, str]], model: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, ) -> str: - response: ChatResponse = self.client.chat(model or self.default_model, messages) + model = model or self.default_model + temperature = temperature if temperature is not NOT_GIVEN else self.default_temperature + + # 处理None值,使用默认温度 + if temperature is None: + temperature = self.default_temperature + + # 确保温度值在有效范围内 + temperature = max(0.0, min(2.0, temperature)) + + response: ChatResponse = self.client.chat( + model=model, + messages=messages, + options={"temperature": temperature} + ) content = response['message']['content'] return self._extract_content(content) diff --git a/biz/llm/client/openai.py b/biz/llm/client/openai.py index 69d35f172..73b4284ce 100644 --- 
a/biz/llm/client/openai.py +++ b/biz/llm/client/openai.py @@ -9,6 +9,7 @@ class OpenAIClient(BaseClient): def __init__(self, api_key: str = None): + super().__init__() # 调用父类初始化 self.api_key = api_key or os.getenv("OPENAI_API_KEY") self.base_url = os.getenv("OPENAI_API_BASE_URL", "https://api.openai.com") if not self.api_key: @@ -20,10 +21,21 @@ def __init__(self, api_key: str = None): def completions(self, messages: List[Dict[str, str]], model: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, ) -> str: model = model or self.default_model + temperature = temperature if temperature is not NOT_GIVEN else self.default_temperature + + # 处理None值,使用默认温度 + if temperature is None: + temperature = self.default_temperature + + # 确保温度值在有效范围内 + temperature = max(0.0, min(2.0, temperature)) + completion = self.client.chat.completions.create( model=model, messages=messages, + temperature=temperature, ) return completion.choices[0].message.content diff --git a/biz/llm/client/qwen.py b/biz/llm/client/qwen.py index 14e03a9dd..e662b0e0e 100644 --- a/biz/llm/client/qwen.py +++ b/biz/llm/client/qwen.py @@ -9,6 +9,7 @@ class QwenClient(BaseClient): def __init__(self, api_key: str = None): + super().__init__() # 调用父类初始化 self.api_key = api_key or os.getenv("QWEN_API_KEY") self.base_url = os.getenv("QWEN_API_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1") if not self.api_key: @@ -21,11 +22,22 @@ def __init__(self, api_key: str = None): def completions(self, messages: List[Dict[str, str]], model: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, ) -> str: model = model or self.default_model + temperature = temperature if temperature is not NOT_GIVEN else self.default_temperature + + # 处理None值,使用默认温度 + if temperature is None: + temperature = self.default_temperature + + # 确保温度值在有效范围内 + temperature = max(0.0, min(2.0, temperature)) + completion = self.client.chat.completions.create( 
model=model, messages=messages, + temperature=temperature, extra_body=self.extra_body, ) return completion.choices[0].message.content diff --git a/biz/llm/client/zhipuai.py b/biz/llm/client/zhipuai.py index 0790cd97f..ff6680e92 100644 --- a/biz/llm/client/zhipuai.py +++ b/biz/llm/client/zhipuai.py @@ -9,6 +9,7 @@ class ZhipuAIClient(BaseClient): def __init__(self, api_key: str = None): + super().__init__() # 调用父类初始化 self.api_key = api_key or os.getenv("ZHIPUAI_API_KEY") if not self.api_key: raise ValueError("API key is required. Please provide it or set it in the environment variables.") @@ -19,10 +20,21 @@ def __init__(self, api_key: str = None): def completions(self, messages: List[Dict[str, str]], model: Optional[str] | NotGiven = NOT_GIVEN, + temperature: Optional[float] | NotGiven = NOT_GIVEN, ) -> str: model = model or self.default_model + temperature = temperature if temperature is not NOT_GIVEN else self.default_temperature + + # 处理None值,使用默认温度 + if temperature is None: + temperature = self.default_temperature + + # 确保温度值在有效范围内 + temperature = max(0.0, min(2.0, temperature)) + completion = self.client.chat.completions.create( model=model, messages=messages, + temperature=temperature, ) return completion.choices[0].message.content diff --git a/biz/queue/worker.py b/biz/queue/worker.py index 169648e52..071621301 100644 --- a/biz/queue/worker.py +++ b/biz/queue/worker.py @@ -7,6 +7,7 @@ from biz.gitlab.webhook_handler import filter_changes, MergeRequestHandler, PushHandler from biz.github.webhook_handler import filter_changes as filter_github_changes, PullRequestHandler as GithubPullRequestHandler, PushHandler as GithubPushHandler from biz.utils.code_reviewer import CodeReviewer +from biz.utils.rag_code_reviewer import RAGCodeReviewer from biz.utils.im import notifier from biz.utils.log import logger @@ -19,45 +20,55 @@ def handle_push_event(webhook_data: dict, gitlab_token: str, gitlab_url: str, gi logger.info('Push Hook event received') commits = 
handler.get_push_commits() if not commits: - logger.error('Failed to get commits') + logger.info('No commits found in push event (likely branch creation/deletion)') return review_result = None score = 0 - additions = 0 - deletions = 0 + should_record = False # 是否应该记录到数据库 + if push_review_enabled: # 获取PUSH的changes changes = handler.get_push_changes() logger.info('changes: %s', changes) changes = filter_changes(changes) + if not changes: logger.info('未检测到PUSH代码的修改,修改文件可能不满足SUPPORTED_EXTENSIONS。') - review_result = "关注的文件没有修改" - - if len(changes) > 0: + review_result = "关注的文件没有修改" + # 如果没有代码变更,不记录到数据库 + should_record = False + else: + should_record = True # 有代码变更才记录 commits_text = ';'.join(commit.get('message', '').strip() for commit in commits) - review_result = CodeReviewer().review_and_strip_code(str(changes), commits_text) - score = CodeReviewer.parse_review_score(review_text=review_result) - for item in changes: - additions += item['additions'] - deletions += item['deletions'] - # 将review结果提交到Gitlab的 notes - handler.add_push_notes(f'Auto Review Result: \n{review_result}') + # 使用RAG增强的代码审查器 + enable_rag = os.environ.get('ENABLE_RAG', '1') == '1' + if enable_rag: + reviewer = RAGCodeReviewer() + review_result = reviewer.review_and_strip_code(str(changes), commits_text) + score = reviewer.parse_review_score(review_text=review_result) + else: + review_result = CodeReviewer().review_and_strip_code(str(changes), commits_text) + score = CodeReviewer.parse_review_score(review_text=review_result) + # 将review结果提交到Gitlab的 notes + handler.add_push_notes(f'Auto Review Result: \n{review_result}') - event_manager['push_reviewed'].send(PushReviewEntity( - project_name=webhook_data['project']['name'], - author=webhook_data['user_username'], - branch=webhook_data['project']['default_branch'], - updated_at=int(datetime.now().timestamp()), # 当前时间 - commits=commits, - score=score, - review_result=review_result, - url_slug=gitlab_url_slug, - webhook_data=webhook_data, - 
additions=additions, - deletions=deletions, - )) + # 只有在有代码变更时才记录到数据库 + if should_record: + # 获取第一个commit的URL作为推送记录的URL + push_url = commits[0].get('url', '') if commits else '' + + event_manager['push_reviewed'].send(PushReviewEntity( + project_name=webhook_data['project']['name'], + author=webhook_data['user_username'], + branch=webhook_data['project']['default_branch'], + updated_at=int(datetime.now().timestamp()), # 当前时间 + commits=commits, + score=score, + review_result=review_result, + url_slug=gitlab_url_slug, + url=push_url, + )) except Exception as e: error_message = f'服务出现未知错误: {str(e)}\n{traceback.format_exc()}' @@ -74,15 +85,10 @@ def handle_merge_request_event(webhook_data: dict, gitlab_token: str, gitlab_url :param gitlab_url_slug: :return: ''' - merge_review_only_protected_branches = os.environ.get('MERGE_REVIEW_ONLY_PROTECTED_BRANCHES_ENABLED', '0') == '1' try: # 解析Webhook数据 handler = MergeRequestHandler(webhook_data, gitlab_token, gitlab_url) logger.info('Merge Request Hook event received') - # 如果开启了仅review projected branches的,判断当前目标分支是否为projected branches - if merge_review_only_protected_branches and not handler.target_branch_protected(): - logger.info("Merge Request target branch not match protected branches, ignored.") - return if handler.action not in ['open', 'update']: logger.info(f"Merge Request Hook event, action={handler.action}, ignored.") @@ -96,12 +102,6 @@ def handle_merge_request_event(webhook_data: dict, gitlab_token: str, gitlab_url if not changes: logger.info('未检测到有关代码的修改,修改文件可能不满足SUPPORTED_EXTENSIONS。') return - # 统计本次新增、删除的代码总数 - additions = 0 - deletions = 0 - for item in changes: - additions += item.get('additions', 0) - deletions += item.get('deletions', 0) # 获取Merge Request的commits commits = handler.get_merge_request_commits() @@ -110,8 +110,16 @@ def handle_merge_request_event(webhook_data: dict, gitlab_token: str, gitlab_url return # review 代码 - commits_text = ';'.join(commit['title'] for commit in commits) - review_result = 
CodeReviewer().review_and_strip_code(str(changes), commits_text) + commits_text = ';'.join(commit['message'] for commit in commits) + # 使用RAG增强的代码审查器 + enable_rag = os.environ.get('ENABLE_RAG', '1') == '1' + if enable_rag: + reviewer = RAGCodeReviewer() + review_result = reviewer.review_and_strip_code(str(changes), commits_text) + score = reviewer.parse_review_score(review_text=review_result) + else: + review_result = CodeReviewer().review_and_strip_code(str(changes), commits_text) + score = CodeReviewer.parse_review_score(review_text=review_result) # 将review结果提交到Gitlab的 notes handler.add_merge_request_notes(f'Auto Review Result: \n{review_result}') @@ -125,13 +133,10 @@ def handle_merge_request_event(webhook_data: dict, gitlab_token: str, gitlab_url target_branch=webhook_data['object_attributes']['target_branch'], updated_at=int(datetime.now().timestamp()), commits=commits, - score=CodeReviewer.parse_review_score(review_text=review_result), + score=score, url=webhook_data['object_attributes']['url'], review_result=review_result, url_slug=gitlab_url_slug, - webhook_data=webhook_data, - additions=additions, - deletions=deletions, ) ) @@ -152,8 +157,6 @@ def handle_github_push_event(webhook_data: dict, github_token: str, github_url: review_result = None score = 0 - additions = 0 - deletions = 0 if push_review_enabled: # 获取PUSH的changes changes = handler.get_push_changes() @@ -165,14 +168,21 @@ def handle_github_push_event(webhook_data: dict, github_token: str, github_url: if len(changes) > 0: commits_text = ';'.join(commit.get('message', '').strip() for commit in commits) - review_result = CodeReviewer().review_and_strip_code(str(changes), commits_text) - score = CodeReviewer.parse_review_score(review_text=review_result) - for item in changes: - additions += item.get('additions', 0) - deletions += item.get('deletions', 0) + # 使用RAG增强的代码审查器 + enable_rag = os.environ.get('ENABLE_RAG', '1') == '1' + if enable_rag: + reviewer = RAGCodeReviewer() + review_result = 
reviewer.review_and_strip_code(str(changes), commits_text) + score = reviewer.parse_review_score(review_text=review_result) + else: + review_result = CodeReviewer().review_and_strip_code(str(changes), commits_text) + score = CodeReviewer.parse_review_score(review_text=review_result) # 将review结果提交到GitHub的 notes handler.add_push_notes(f'Auto Review Result: \n{review_result}') + # 获取第一个commit的URL作为推送记录的URL + push_url = commits[0].get('url', '') if commits else '' + event_manager['push_reviewed'].send(PushReviewEntity( project_name=webhook_data['repository']['name'], author=webhook_data['sender']['login'], @@ -182,9 +192,7 @@ def handle_github_push_event(webhook_data: dict, github_token: str, github_url: score=score, review_result=review_result, url_slug=github_url_slug, - webhook_data=webhook_data, - additions=additions, - deletions=deletions, + url=push_url, )) except Exception as e: @@ -202,15 +210,10 @@ def handle_github_pull_request_event(webhook_data: dict, github_token: str, gith :param github_url_slug: :return: ''' - merge_review_only_protected_branches = os.environ.get('MERGE_REVIEW_ONLY_PROTECTED_BRANCHES_ENABLED', '0') == '1' try: # 解析Webhook数据 handler = GithubPullRequestHandler(webhook_data, github_token, github_url) logger.info('GitHub Pull Request event received') - # 如果开启了仅review projected branches的,判断当前目标分支是否为projected branches - if merge_review_only_protected_branches and not handler.target_branch_protected(): - logger.info("Merge Request target branch not match protected branches, ignored.") - return if handler.action not in ['opened', 'synchronize']: logger.info(f"Pull Request Hook event, action={handler.action}, ignored.") @@ -224,12 +227,6 @@ def handle_github_pull_request_event(webhook_data: dict, github_token: str, gith if not changes: logger.info('未检测到有关代码的修改,修改文件可能不满足SUPPORTED_EXTENSIONS。') return - # 统计本次新增、删除的代码总数 - additions = 0 - deletions = 0 - for item in changes: - additions += item.get('additions', 0) - deletions += 
item.get('deletions', 0) # 获取Pull Request的commits commits = handler.get_pull_request_commits() @@ -256,10 +253,7 @@ def handle_github_pull_request_event(webhook_data: dict, github_token: str, gith score=CodeReviewer.parse_review_score(review_text=review_result), url=webhook_data['pull_request']['html_url'], review_result=review_result, - url_slug=github_url_slug, - webhook_data=webhook_data, - additions=additions, - deletions=deletions, + url_slug=github_url_slug )) except Exception as e: diff --git a/biz/utils/knowledge_base.py b/biz/utils/knowledge_base.py new file mode 100644 index 000000000..4cde3ca3b --- /dev/null +++ b/biz/utils/knowledge_base.py @@ -0,0 +1,674 @@ +import os +import json +import uuid +from typing import List, Dict, Any, Optional +from pathlib import Path +import hashlib +import chromadb +from chromadb.config import Settings +from sentence_transformers import SentenceTransformer +import PyPDF2 +from docx import Document +import markdown +from bs4 import BeautifulSoup +import requests +import yaml +from biz.utils.log import logger +import re + + +class DocumentProcessor: + """文档处理器,支持多种文档格式""" + + @staticmethod + def extract_text_from_pdf(file_path: str) -> str: + """从PDF文件提取文本""" + try: + with open(file_path, 'rb') as file: + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + "\n" + return text.strip() + except Exception as e: + logger.error(f"PDF文件处理失败: {e}") + return "" + + @staticmethod + def extract_text_from_docx(file_path: str) -> str: + """从Word文档提取文本""" + try: + doc = Document(file_path) + text = "" + for paragraph in doc.paragraphs: + text += paragraph.text + "\n" + return text.strip() + except Exception as e: + logger.error(f"Word文档处理失败: {e}") + return "" + + @staticmethod + def extract_text_from_md(file_path: str) -> str: + """从Markdown文件提取文本""" + try: + with open(file_path, 'r', encoding='utf-8') as file: + md_content = file.read() + html = markdown.markdown(md_content) + soup = 
class TextSplitter:
    """Split long text into overlapping chunks, preferring sentence boundaries.

    Args:
        chunk_size: Target maximum number of characters per chunk. Must be
            positive.
        chunk_overlap: Number of characters at the end of one chunk that are
            repeated at the start of the next chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive (such a value could never
            make progress through the text).
    """

    # Characters that count as a sentence boundary. Both ASCII and full-width
    # CJK punctuation are listed so Chinese documents split cleanly as well.
    _SENTENCE_ENDS = frozenset(['.', '?', '!', '\n', '。', '？', '！'])

    # How many characters before the nominal chunk end are scanned for a
    # sentence boundary.
    _BOUNDARY_LOOKBACK = 200

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into chunks of roughly ``chunk_size`` characters.

        Text that already fits in one chunk is returned unchanged as a
        single-element list. Otherwise each chunk is cut at the nearest
        sentence boundary found within the look-back window before the nominal
        end, stripped of surrounding whitespace, and dropped if empty.
        Consecutive chunks overlap by ``chunk_overlap`` characters.

        Returns:
            The list of non-empty chunks, in document order.
        """
        if len(text) <= self.chunk_size:
            return [text]

        chunks = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # Only look for a sentence boundary when the chunk does not already
            # reach the end of the text.
            if end < text_len:
                floor = max(start + self.chunk_size - self._BOUNDARY_LOOKBACK, start)
                for i in range(end, floor, -1):
                    if text[i] in self._SENTENCE_ENDS:
                        end = i + 1
                        break

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            # Once a chunk reaches the end of the text we are done; stepping
            # back by the overlap here would only emit a tail that is already
            # fully contained in the chunk just produced.
            if end >= text_len:
                break

            # Step back by the overlap, but always move forward so that an
            # overlap >= chunk length cannot cause an endless loop.
            next_start = end - self.chunk_overlap
            start = next_start if next_start > start else end

        return chunks
os.path.join(project_root, 'model', 'all-MiniLM-L6-v2') + + # 检查本地模型是否存在 + if os.path.exists(model_path): + logger.info(f"使用本地模型: {model_path}") + self.model = SentenceTransformer(model_path) + else: + logger.warning(f"本地模型路径不存在: {model_path}, 使用在线模型") + self.model = SentenceTransformer('all-MiniLM-L6-v2') + self.text_splitter = TextSplitter() + self.doc_processor = DocumentProcessor() + + # 创建集合,使用余弦相似度 + self.custom_collection = self._get_or_create_collection( + "custom_knowledge", + metadata={"hnsw:space": "cosine"} # 使用余弦相似度 + ) + self.builtin_collection = self._get_or_create_collection( + "builtin_knowledge", + metadata={"hnsw:space": "cosine"} # 使用余弦相似度 + ) + + # 检查是否需要初始化内置知识库 + config = self._load_builtin_config() + # 可以通过环境变量禁用自动初始化 + auto_init = os.getenv("AUTO_INIT_BUILTIN_KNOWLEDGE", "1") == "1" + if config.get("settings", {}).get("auto_init", True) and auto_init: + # 检查内置集合是否为空 + try: + existing_docs = self.builtin_collection.get(include=["metadatas"]) + # 更严格的检查:确保真的有文档内容 + if not existing_docs['metadatas'] or len(existing_docs['metadatas']) == 0: + logger.info("内置知识库为空,开始初始化...") + self._init_builtin_knowledge() + else: + # 检查是否有有效的文档(不是空文档) + valid_docs = [doc for doc in existing_docs['metadatas'] if doc.get('title') and doc.get('title').strip()] + if not valid_docs: + logger.info("内置知识库中没有有效文档,开始初始化...") + self._init_builtin_knowledge() + else: + logger.info(f"内置知识库已存在 {len(valid_docs)} 个有效文档,跳过初始化") + except Exception as e: + logger.warning(f"检查内置知识库状态失败: {e},跳过自动初始化") + else: + logger.info("自动初始化内置知识库已禁用") + + def _get_or_create_collection(self, name: str, metadata: dict = None): + """获取或创建集合""" + try: + return self.client.get_collection(name) + except: + return self.client.create_collection( + name, + metadata=metadata + ) + + def _load_builtin_config(self) -> Dict[str, Any]: + """加载内置知识库配置""" + config_path = "conf/builtin_knowledge.yml" + try: + with open(config_path, 'r', encoding='utf-8') as f: + return yaml.safe_load(f) + except 
FileNotFoundError: + logger.warning(f"配置文件不存在: {config_path},使用空配置") + return {"builtin_documents": [], "settings": {"enabled": True}} + except Exception as e: + logger.error(f"加载配置文件失败: {e}") + return {"builtin_documents": [], "settings": {"enabled": True}} + + def _init_builtin_knowledge(self): + """从配置文件和文档文件初始化内置知识库""" + config = self._load_builtin_config() + + # 检查是否禁用内置知识库 + if not config.get("settings", {}).get("enabled", True): + logger.info("内置知识库已禁用") + return + + builtin_docs = config.get("builtin_documents", []) + if not builtin_docs: + logger.warning("配置文件中没有找到内置文档配置") + return + + # 加载每个内置文档 + loaded_count = 0 + for doc_config in builtin_docs: + try: + title = doc_config.get("title", "未知文档") + file_path = doc_config.get("file", "") + tags = doc_config.get("tags", []) + + if not file_path: + logger.warning(f"文档 {title} 没有指定文件路径") + continue + + # 检查文件是否存在 + if not os.path.exists(file_path): + logger.warning(f"文档文件不存在: {file_path}") + continue + + # 读取文档内容 + content = self.doc_processor.process_document(file_path) + if not content.strip(): + logger.warning(f"文档 {title} 内容为空") + continue + + # 添加到知识库 + self.add_builtin_document(title, content, tags) + loaded_count += 1 + logger.info(f"✅ 成功加载内置文档: {title}") + + except Exception as e: + logger.error(f"❌ 加载内置文档失败 {doc_config.get('title', '未知')}: {e}") + + logger.info(f"内置知识库初始化完成,成功加载 {loaded_count} 个文档") + + def add_custom_document(self, title: str, file_path: str, tags: List[str] = None) -> str: + """添加自定义文档到知识库""" + try: + # 处理文档 + content = self.doc_processor.process_document(file_path) + if not content: + raise ValueError("文档内容为空") + + return self._add_document(self.custom_collection, title, content, tags or [], "custom") + except Exception as e: + logger.error(f"添加自定义文档失败: {e}") + raise + + def add_builtin_document(self, title: str, content: str, tags: List[str] = None) -> str: + """添加内置文档到知识库""" + return self._add_document(self.builtin_collection, title, content, tags or [], "builtin") + + def 
def search_relevant_documents(self, query: str, n_results: int = 5, source: str = "all", similarity_threshold: float = 0.0) -> List[Dict[str, Any]]:
    """Search the knowledge collections for chunks similar to ``query``.

    Args:
        query: Free-text search query; it is embedded with ``self.model``.
        n_results: Maximum number of results to return. Values <= 0 yield an
            empty list.
        source: Which collections to search: "all", "custom" or "builtin".
            Any other value searches nothing and returns an empty list.
        similarity_threshold: Only results whose score is greater than or equal
            to this value are kept.

    Returns:
        A list of dicts with keys "content", "metadata", "score" and "source",
        sorted by descending score and truncated to ``n_results``. The score is
        ``1 - distance`` (assumes the collections report cosine distance —
        confirm against how they are created).
    """
    # A non-positive limit can never return anything; bail out before doing
    # any embedding or query work (and before a negative slice bound could
    # drop results from the wrong end).
    if n_results <= 0:
        return []

    # Pick the collections to search.
    collections_to_search = []
    if source in ("all", "custom"):
        collections_to_search.append(("custom", self.custom_collection))
    if source in ("all", "builtin"):
        collections_to_search.append(("builtin", self.builtin_collection))
    if not collections_to_search:
        return []

    query_embedding = self.model.encode([query]).tolist()
    results = []

    for source_name, collection in collections_to_search:
        try:
            # Querying an empty collection is pointless; skip it.
            collection_count = collection.count()
            if collection_count == 0:
                logger.info(f"{source_name} 集合为空,跳过搜索")
                continue

            # Never ask for more results than the collection holds.
            search_results = collection.query(
                query_embeddings=query_embedding,
                n_results=min(n_results, collection_count),
                include=["documents", "metadatas", "distances"]
            )

            documents = search_results['documents']
            if not documents or len(documents[0]) == 0:
                continue

            for i, content in enumerate(documents[0]):
                # Convert distance to a similarity score.
                similarity_score = 1 - search_results['distances'][0][i]
                if similarity_score >= similarity_threshold:
                    results.append({
                        "content": content,
                        "metadata": search_results['metadatas'][0][i],
                        "score": similarity_score,
                        "source": source_name
                    })
        except Exception as e:
            # Best effort: a failure in one collection must not prevent
            # results from the other being returned.
            logger.error(f"搜索 {source_name} 集合失败: {e}")

    # Highest similarity first.
    results.sort(key=lambda x: x['score'], reverse=True)

    return results[:n_results]
def get_knowledge_for_code_review(self, code_content: str, similarity_threshold: float = 0.2) -> List[Dict[str, Any]]:
    """Retrieve knowledge documents relevant to reviewing ``code_content``.

    The dominant language of the code is guessed with weighted regex and
    library-name heuristics; several fixed queries for that language are then
    run through ``search_relevant_documents_with_full_docs`` and the results
    are de-duplicated per document.

    Args:
        code_content: The code (or diff text) under review.
        similarity_threshold: Threshold forwarded to the document search.

    Returns:
        One result per ``doc_id`` (the highest-scoring one), sorted by
        descending score. Empty when no language feature is detected.
    """
    # Per-language heuristics: (regex, weight) pairs plus well-known library
    # names that each add a fixed bonus when mentioned anywhere in the code.
    language_patterns = {
        "python": {
            "keywords": [
                (r"\bdef\s+\w+\s*\(", 3),  # function definition
                (r"\bclass\s+\w+[:\(]", 3),  # class definition
                (r"\bimport\s+[\w\s,]+", 2),  # import statement
                (r"from\s+[\w\.]+\s+import", 2),  # from-import statement
                (r"@\w+", 1),  # decorator
                (r":\s*$", 1),  # line ending with a colon (block start)
                (r"__\w+__", 1),  # dunder name
                (r"self\.", 1),  # self reference
            ],
            "libraries": ["django", "flask", "requests", "numpy", "pandas", "tensorflow", "pytorch"]
        },
        "javascript": {
            "keywords": [
                (r"\bconst\s+\w+\s*=", 3),  # const declaration
                (r"\blet\s+\w+\s*=", 3),  # let declaration
                (r"=>\s*{", 2),  # arrow function
                (r"\bfunction\s+\w+\s*\(", 2),  # function declaration
                (r"\bimport\s+.*\bfrom\b", 2),  # ES6 import
                (r"\bexport\s+", 1),  # export statement
                (r"\bawait\b", 1),  # async/await
            ],
            "libraries": ["react", "vue", "angular", "express", "node", "axios"]
        },
        "java": {
            "keywords": [
                (r"\bclass\s+\w+", 3),  # class definition
                (r"\bpublic\s+|private\s+|protected\s+", 2),  # access modifiers
                (r"@\w+", 2),  # annotation
                (r"\binterface\s+\w+", 2),  # interface definition
                (r"\bextends\s+|\bimplements\s+", 1),  # inheritance / implementation
            ],
            "libraries": ["spring", "hibernate", "mybatis", "junit"]
        },
        "go": {
            "keywords": [
                (r"\bfunc\s+\w+\s*\(", 3),  # function definition
                (r"\btype\s+\w+\s+struct\b", 3),  # struct definition
                (r"\bpackage\s+\w+", 2),  # package clause
                (r"\binterface\s*{", 2),  # interface definition
                (r"\bgo\s+", 1),  # goroutine
            ],
            "libraries": ["gin", "gorm", "echo"]
        },
        "cpp": {
            "keywords": [
                (r"#include\s+[<\"][\w\.]+[>\"]", 3),  # include directive
                (r"\bclass\s+\w+", 3),  # class definition
                (r"\btemplate\s*<", 2),  # template
                (r"::\s*", 1),  # scope resolution
            ],
            "libraries": ["boost", "qt", "opencv"]
        },
        "html": {
            "keywords": [
                (r"<\w+[^>]*>", 2),  # opening tag
                # Closing tag. This must be a real pattern: an empty regex
                # matches at every character position and would make "html"
                # outscore every other language on any input.
                (r"</\w+>", 1),
                (r"\bclass\s*=\s*[\"']", 1),  # class attribute
            ],
            "libraries": []
        },
        "css": {
            "keywords": [
                (r"{\s*[\w\-]+\s*:", 2),  # rule block
                (r"@media\b", 2),  # media query
                (r"#[\w\-]+\s*{", 1),  # id selector
            ],
            "libraries": []
        }
    }

    # Lower-case once for the case-insensitive library-name check.
    lowered_code = code_content.lower()

    # Score every language.
    language_scores = {}
    for lang, patterns in language_patterns.items():
        score = 0
        for pattern, weight in patterns["keywords"]:
            # MULTILINE so that "$" anchors (block-start heuristic) apply to
            # every line, not only to the very end of the input.
            matches = len(re.findall(pattern, code_content, flags=re.MULTILINE))
            score += matches * weight

        for lib in patterns["libraries"]:
            if lib in lowered_code:
                score += 2

        if score > 0:
            language_scores[lang] = score
            logger.info(f"\n--------{lang} score: {score}--------")

    # The primary language is the highest-scoring one (first wins on ties).
    primary_language = None
    if language_scores:
        primary_language = max(language_scores, key=language_scores.get)

    logger.info(f"\n--------Primary language: {primary_language}--------")

    # Nothing recognisable: no knowledge to attach.
    if not primary_language:
        logger.info("No language features detected")
        return []

    # Several queries covering different review angles.
    search_queries = [
        f"{primary_language} standards coding best practices",  # baseline
        f"{primary_language} common pitfalls and solutions",  # common problems
        f"{primary_language} security guidelines",  # security
        f"{primary_language} performance optimization"  # performance
    ]

    # Run every query through the full-document retrieval and pool the hits.
    all_results = []
    for query in search_queries:
        results = self.search_relevant_documents_with_full_docs(query, n_results=2, source='all', similarity_threshold=similarity_threshold)
        all_results.extend(results)

    # De-duplicate per document, keeping the highest-scoring hit.
    unique_results = {}
    for result in all_results:
        doc_id = result['metadata']['doc_id']
        if doc_id not in unique_results or result['score'] > unique_results[doc_id]['score']:
            unique_results[doc_id] = result

    # Highest similarity first.
    sorted_results = sorted(unique_results.values(), key=lambda x: x['score'], reverse=True)

    return sorted_results
def delete_document(self, doc_id: str, source: str = "custom"):
    """Delete every chunk that belongs to the document ``doc_id``.

    Args:
        doc_id: Document identifier as stored in each chunk's metadata.
        source: "custom" selects the custom collection; any other value
            selects the built-in collection.

    Raises:
        Exception: Any error from the collection is logged and re-raised.
    """
    collection = self.custom_collection if source == "custom" else self.builtin_collection

    try:
        # Only metadata is needed to find the matching chunks; the chunk ids
        # come back alongside it. Chunk texts are not requested, so the whole
        # corpus is not loaded just to delete one document.
        all_data = collection.get(include=["metadatas"])

        chunk_ids_to_delete = [
            all_data['ids'][i]
            for i, metadata in enumerate(all_data['metadatas'])
            if metadata.get('doc_id') == doc_id
        ]

        if chunk_ids_to_delete:
            # Remove all chunks of the document in one call.
            collection.delete(ids=chunk_ids_to_delete)
            logger.info(f"已删除文档 {doc_id},共 {len(chunk_ids_to_delete)} 个块")
        else:
            logger.warning(f"未找到文档 {doc_id}")
    except Exception as e:
        logger.error(f"删除文档失败: {e}")
        raise
def _load_prompts(self, prompt_key: str, style="professional") -> Dict[str, Any]:
    """Load and render the prompt pair for ``prompt_key`` from the YAML config.

    The file ``conf/prompt_templates.yml`` is read; if it has no entry for
    ``prompt_key`` the "code_review_prompt" entry is used instead. Both the
    system and the user prompt are rendered as Jinja2 templates with ``style``.

    Args:
        prompt_key: Top-level key of the prompt section to load.
        style: Review style name passed to the templates.

    Returns:
        A dict with "system_message" and "user_message" chat messages. When the
        file is missing, is not valid YAML, is empty / not a mapping, or lacks
        the required prompt fields, the built-in default prompts are returned.
    """
    prompt_templates_file = "conf/prompt_templates.yml"
    try:
        with open(prompt_templates_file, "r", encoding="utf-8") as file:
            loaded = yaml.safe_load(file)

        # An empty file loads as None and a malformed one may load as a list
        # or scalar; treat both as "no configuration" so the KeyError path
        # below falls back to the defaults instead of crashing.
        prompts_config = loaded if isinstance(loaded, dict) else {}

        # No RAG-specific section: fall back to the plain code review prompts.
        if prompt_key not in prompts_config:
            prompt_key = "code_review_prompt"

        prompts = prompts_config.get(prompt_key)
        if not isinstance(prompts, dict):
            prompts = {}

        def render_template(template_str: str) -> str:
            return Template(template_str).render(style=style)

        # Missing fields raise KeyError, handled below.
        system_prompt = render_template(prompts["system_prompt"])
        user_prompt = render_template(prompts["user_prompt"])

        return {
            "system_message": {"role": "system", "content": system_prompt},
            "user_message": {"role": "user", "content": user_prompt},
        }
    except (FileNotFoundError, KeyError, yaml.YAMLError) as e:
        logger.error(f"加载提示词配置失败: {e}")
        # Fall back to the built-in prompts.
        return self._get_default_prompts()
def get_relevant_knowledge(self, code_content: str, similarity_threshold: float = None) -> str:
    """Build the knowledge text that accompanies a review of ``code_content``.

    Args:
        code_content: The code (or diff text) under review.
        similarity_threshold: Search threshold; when None the instance's
            ``similarity_threshold`` is used.

    Returns:
        One Markdown section per retrieved document, joined by blank lines.
        An empty string when RAG is disabled, nothing is retrieved, or the
        lookup fails (the failure is logged).
    """
    if not self.enable_rag:
        return ""

    # Fall back to the instance-level threshold when none is given.
    threshold = self.similarity_threshold if similarity_threshold is None else similarity_threshold

    try:
        docs = self.knowledge_base.get_knowledge_for_code_review(code_content, threshold)
        if not docs:
            return ""

        sections = []
        for doc in docs:
            meta = doc['metadata']
            # Mark entries that carry a whole document rather than one chunk.
            full_doc_marker = ' [完整文档]' if meta.get('is_full_document', False) else ''
            sections.append(
                f"### {meta['title']} (相似度: {doc['score']:.2f}){full_doc_marker}\n{doc['content']}"
            )

        logger.info(f"检索到 {len(docs)} 个相关文档片段")
        return "\n\n".join(sections)

    except Exception as e:
        logger.error(f"获取相关知识失败: {e}")
        return ""
def review_code(self, diffs_text: str, commits_text: str = "", relevant_docs: str = "", temperature: Optional[float] = None) -> str:
    """Run one RAG-backed review request against the LLM.

    Args:
        diffs_text: The code changes to review.
        commits_text: Commit messages; a placeholder is used when empty.
        relevant_docs: Retrieved knowledge text; a placeholder is used when
            empty.
        temperature: Value forwarded unchanged to ``self.call_llm``.

    Returns:
        Whatever ``self.call_llm`` returns for the assembled messages.
    """
    # Fill the user prompt template; empty optional parts get a placeholder.
    template = self.prompts["user_message"]["content"]
    commits_part = commits_text or "无提交信息"
    docs_part = relevant_docs or "无相关文档"
    user_content = template.format(
        diffs_text=diffs_text,
        commits_text=commits_part,
        relevant_docs=docs_part
    )

    # System prompt first, then the filled-in user prompt.
    user_message = {"role": "user", "content": user_content}
    messages = [self.prompts["system_message"], user_message]

    return self.call_llm(messages, temperature)
#OpenAI settings -OPENAI_API_KEY=xxxx -OPENAI_API_BASE_URL=https://api.openai.com/v1 -OPENAI_API_MODEL=gpt-4o-mini +OPENAI_API_KEY=EMPTY +OPENAI_API_BASE_URL=http://127.0.0.1:9997/v1 +OPENAI_API_MODEL=Llama2-Chinese-13b-Chat-ms #ZhipuAI settings ZHIPUAI_API_KEY=xxxx @@ -31,10 +31,16 @@ QWEN_API_MODEL=qwen-coder-plus OLLAMA_API_BASE_URL=http://host.docker.internal:11434 OLLAMA_API_MODEL=deepseek-r1:latest +# 模型温度控制 (0.0-2.0) +# 0.0-0.3: 确定性高,适合代码审查 +# 0.4-0.7: 平衡创造性和一致性 +# 0.8-2.0: 创造性高,输出更随机 +LLM_TEMPERATURE=0.3 + #支持review的文件类型 -SUPPORTED_EXTENSIONS=.c,.cc,.cpp,.css,.go,.h,.java,.js,.jsx,.ts,.tsx,.md,.php,.py,.sql,.vue,.yml +SUPPORTED_EXTENSIONS=.c,.cc,.cpp,.css,.go,.h,.java,.js,.jsx,.ts,.tsx,.md,.php,.py,.sql,.vue,.yml,.html #每次 Review 的最大 Token 限制(超出部分自动截断) -REVIEW_MAX_TOKENS=10000 +REVIEW_MAX_TOKENS=30000 #Review 风格选项:professional(专业) | sarcastic(毒舌) | gentle(温和) | humorous(幽默) REVIEW_STYLE=professional @@ -50,11 +56,6 @@ WECOM_WEBHOOK_URL=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxx FEISHU_ENABLED=0 FEISHU_WEBHOOK_URL=https://open.feishu.cn/open-apis/bot/v2/hook/xxx -#自定义webhook配置,使用场景:通过飞书发送应用消息可以实现Push评审通知到提交人,在自定义webhook里可以实现各种定制通知功能 -#参数EXTRA_WEBHOOK_URL接收POST请求,data={ai_codereview_data: {}, webhook_data: {}},ai_codereview_data为本系统通知的数据,webhook_data为原github、gitlab hook触发的数据 -EXTRA_WEBHOOK_ENABLED=0 -EXTRA_WEBHOOK_URL=https://xxx/xxx - #日志配置 LOG_FILE=log/app.log LOG_MAX_BYTES=10485760 @@ -66,25 +67,38 @@ REPORT_CRONTAB_EXPRESSION=0 18 * * 1-5 #Gitlab配置 #GITLAB_URL={YOUR_GITLAB_URL} #部分老版本Gitlab webhook不传递URL,需要开启此配置,示例:https://gitlab.example.com -#GITLAB_ACCESS_TOKEN={YOUR_GITLAB_ACCESS_TOKEN} #系统会优先使用此GITLAB_ACCESS_TOKEN,如果未配置,则使用Webhook 传递的Secret Token +GITLAB_ACCESS_TOKEN=glpat-sxihr4Ee_qym9QEqu6GW #系统会优先使用此GITLAB_ACCESS_TOKEN,如果未配置,则使用Webhook 传递的Secret Token #Github配置(如果使用 Github 作为代码托管平台,需要配置此项) #GITHUB_ACCESS_TOKEN={YOUR_GITHUB_ACCESS_TOKEN} # 开启Push Review功能(如果不需要push事件触发Code Review,设置为0) PUSH_REVIEW_ENABLED=1 -# 
开启Merge请求过滤,过滤仅当合并目标分支是受保护分支时才Review(开启此选项请确保仓库已配置受保护分支protected branches) -MERGE_REVIEW_ONLY_PROTECTED_BRANCHES_ENABLED=0 # Dashboard登录用户名和密码 DASHBOARD_USER=admin -DASHBOARD_PASSWORD=admin +DASHBOARD_PASSWORD=wengqian # queue (async, rq) QUEUE_DRIVER=async -# REDIS_HOST=redis +REDIS_HOST=redis # REDIS_HOST=127.0.0.1 # REDIS_PORT=6379 # gitlab domain slugged WORKER_QUEUE=git_test_com + +# RAG功能配置 +ENABLE_RAG=1 +# 1表示启用RAG,0表示使用原有审查方式 + +# 知识库配置 +KNOWLEDGE_BASE_PATH=data/knowledge_base +CHUNK_SIZE=1000 +CHUNK_OVERLAP=200 +SEARCH_RESULTS_LIMIT=5 +RAG_SIMILARITY_THRESHOLD=0.2 +AUTO_INIT_BUILTIN_KNOWLEDGE=0 + +# HMAC-SHA256 签名 +SECRET_KEY=fac8cf149bdd616c07c1a675c4571ccacc40d7f7fe16914cfe0f9f9d966bb773 diff --git a/conf/builtin_knowledge.yml b/conf/builtin_knowledge.yml new file mode 100644 index 000000000..e71511bd1 --- /dev/null +++ b/conf/builtin_knowledge.yml @@ -0,0 +1,49 @@ +# 内置知识库配置 +# 配置内置技术文档和最佳实践 + +builtin_documents: + - title: "HTML编码规范" + file: "docs/builtin/html_standards.md" + tags: ["html", "frontend", "coding-standards"] + description: "HTML开发的编码规范,包括语义化标签、可访问性、SEO优化等" + + - title: "CSS编码规范" + file: "docs/builtin/css_standards.md" + tags: ["css", "frontend", "coding-standards"] + description: "CSS开发的编码规范,包括命名规范、响应式设计、性能优化等" + + - title: "JavaScript编码规范" + file: "docs/builtin/javascript_standards.md" + tags: ["javascript", "frontend", "coding-standards"] + description: "JavaScript开发的编码规范,包括ES6+特性、函数式编程、性能优化等" + + - title: "Java编码规范" + file: "docs/builtin/java_standards.md" + tags: ["java", "backend", "coding-standards"] + description: "Java开发的编码规范,包括SOLID原则、并发编程、性能优化等" + + - title: "Python编码规范" + file: "docs/builtin/python_standards.md" + tags: ["python", "backend", "coding-standards"] + description: "Python开发的编码规范,包括PEP 8、类型注解、最佳实践等" + + - title: "C++编码规范" + file: "docs/builtin/cpp_standards.md" + tags: ["cpp", "backend", "coding-standards"] + description: "C++开发的编码规范,包括内存管理、RAII、并发编程等" + + - title: "Go编码规范" + file: "docs/builtin/go_standards.md" + 
tags: ["go", "backend", "coding-standards"] + description: "Go开发的编码规范,包括并发安全、错误处理、接口设计等" + +# 配置参数 +settings: + # 是否启用内置知识库 + enabled: true + + # 自动初始化(首次启动时加载内置文档) + auto_init: true + + # 文档编码格式 + encoding: "utf-8" \ No newline at end of file diff --git a/conf/prompt_templates.yml b/conf/prompt_templates.yml index 66258255a..7b2f9fd74 100644 --- a/conf/prompt_templates.yml +++ b/conf/prompt_templates.yml @@ -41,3 +41,85 @@ code_review_prompt: 提交历史(commits): {commits_text} + +rag_code_review_prompt: + system_prompt: |- + 你是一位资深的软件开发工程师和代码审查专家,拥有丰富的技术知识和最佳实践经验。你的任务是基于代码变更和相关技术文档进行全面的代码审查。 + + ### 代码审查目标与评分标准: + 1. 与最佳实践的符合度(35分): + - 完全符合文档中的最佳实践:30-35分 + - 大部分符合但有小问题:20-29分 + - 存在明显违背最佳实践的情况:10-19分 + - 严重违背最佳实践:0-9分 + + 2. 代码质量与安全性(35分): + - 完全符合文档中的安全规范:30-35分 + - 存在轻微安全隐患:20-29分 + - 存在明显安全风险:10-19分 + - 严重安全漏洞:0-9分 + + 3. 性能与可维护性(20分): + - 完全符合性能最佳实践:15-20分 + - 轻微性能问题:10-14分 + - 明显性能隐患:5-9分 + - 严重性能问题:0-4分 + + 4. 文档一致性(10分): + - 完全遵循文档规范:8-10分 + - 部分遵循文档规范:4-7分 + - 基本不符合文档规范:0-3分 + + ### 评分规则: + 1. 严格对照检索到的技术文档进行评分 + 2. 如果某项没有相关文档参考,该项按照一般标准评分 + 3. 对于违背文档明确规定的情况,必须在对应项目显著扣分 + 4. 发现严重安全漏洞或严重违背最佳实践时,总分不得超过60分 + + ### 审查策略: + 1. 优先参考提供的相关技术文档或编码规范 + 2. 明确指出代码与文档规范的匹配程度 + 3. 对于每个问题,都要引用相关文档作为依据 + 4. 如果发现代码与文档规范不符,需要: + - 引用具体的文档内容 + - 说明不符合的具体原因 + - 提供基于文档的改进建议 + + ### 输出格式: + 请以Markdown格式输出代码审查报告,包含: + 1. 文档匹配分析:列出代码与检索到的文档的匹配程度 + 2. 问题说明:每个问题都需要引用相关文档作为依据 + 3. 改进建议:基于文档提供具体的改进方案 + 4. 评分明细:为每个评分标准提供具体分数 + 5. 
总分:格式为“总分:XX分”(例如:总分:80分),确保可通过正则表达式 r"总分[::]\s*(\d+)分?") 解析出总分。 + + ### 特别说明: + 整个评论要保持{{ style }}风格 + {% if style == 'professional' %} + 评论时请使用标准的工程术语,结合技术文档保持专业严谨。 + {% elif style == 'sarcastic' %} + 评论时请大胆使用讽刺性语言,但要确保基于文档的技术指正准确。 + {% elif style == 'gentle' %} + 评论时请多用"根据最佳实践建议"、"文档中提到可以考虑"等温和措辞。 + {% elif style == 'humorous' %} + 评论时请在技术点评中加入适当幽默元素,合理使用Emoji: + - 📚 表示参考文档 + - 💡 表示最佳实践建议 + - 🐛 表示bug + - 💥 表示严重问题 + - 🎯 表示改进建议 + {% endif %} + + user_prompt: |- + 请基于代码变更和相关技术文档,以{{ style }}风格进行代码审查。 + + ## 代码变更内容: + {diffs_text} + + ## 提交历史(commits): + {commits_text} + + ## 相关技术文档和最佳实践: + {relevant_docs} + + 请严格按照检索到的技术文档中的规范和最佳实践,对代码变更进行全面审查。对于每个发现的问题,都需要引用相关文档作为依据。 diff --git a/docs/builtin/cpp_standards.md b/docs/builtin/cpp_standards.md new file mode 100644 index 000000000..ce92fecbc --- /dev/null +++ b/docs/builtin/cpp_standards.md @@ -0,0 +1,107 @@ +# C++编码规范 + +**C++代码规范 | CPP编程标准 | C++最佳实践 | C++代码审查** + +C++编程语言的编码规范和最佳实践指南,适用于代码审查和质量控制。 + +--- + +## 1. 命名规范 + +### 1.1 通用规则 +- 使用有意义的名称 +- 避免缩写(除非广泛使用) +- 保持一致性 + +### 1.2 具体规范 +**类名**:使用PascalCase,如TaskManager、ResourceHandler +**函数名**:使用camelCase,如processTask、isValid +**变量名**:使用camelCase,如itemCount、firstName +**常量**:使用UPPER_SNAKE_CASE,如MAX_ITEMS、PI +**命名空间**:使用小写字母,如utils、database + +## 2. 内存管理 + +### 2.1 智能指针 +**原则**:优先使用智能指针,避免裸指针 + +**unique_ptr**:管理独占资源,不能被拷贝 +**shared_ptr**:管理共享资源,支持引用计数 + +**最佳实践**: +- 使用std::make_unique和std::make_shared +- 避免使用裸指针 +- 注意避免循环引用 + +### 2.2 RAII原则 +**原则**:构造函数获取资源,析构函数释放资源 + +**移动语义**:使用移动构造函数和移动赋值运算符提高性能 + +## 3. 现代C++特性 + +### 3.1 auto关键字 +**规范**:在类型明显时使用auto,提高可读性 + +**最佳实践**: +- 避免过度使用auto +- 在lambda表达式和模板中使用 + +### 3.2 范围for循环 +**使用**:简化容器遍历,提高可读性 + +### 3.3 Lambda表达式 +**使用**:创建匿名函数,适用于函数对象场景 + +**规范**: +- 使用auto接收lambda表达式 +- 合理使用捕获列表 +- 避免捕获大对象 + +## 4. 异常处理 + +### 4.1 异常安全 +**保证**:函数提供异常安全保证,保持程序状态一致 + +**策略**: +- 使用RAII管理资源 +- 避免析构函数抛出异常 +- 使用智能指针避免泄漏 + +### 4.2 异常处理最佳实践 +**原则**: +- 只对异常情况使用异常处理 +- 提供有意义异常信息 +- 使用适当异常类型 +- 避免影响性能 + +## 5. 
模板编程 + +### 5.1 函数模板 +**设计**:实现泛型编程,提高代码复用性 + +**最佳实践**: +- 使用概念约束模板参数 +- 避免过度复杂模板元编程 +- 提供清晰错误信息 + +### 5.2 类模板 +**设计**:提供类型安全的泛型容器和算法 + +## 6. 并发编程 + +### 6.1 线程安全 +**设计**:确保共享数据访问线程安全 + +**策略**: +- 使用std::mutex保护共享数据 +- 使用std::atomic进行原子操作 +- 避免数据竞争和死锁 + +### 6.2 异步编程 +**模式**:使用std::future和std::async + +**最佳实践**: +- 合理使用异步操作 +- 避免过度使用线程 +- 使用线程池管理资源 \ No newline at end of file diff --git a/docs/builtin/css_standards.md b/docs/builtin/css_standards.md new file mode 100644 index 000000000..14eba6f9c --- /dev/null +++ b/docs/builtin/css_standards.md @@ -0,0 +1,83 @@ +# CSS编码规范 + +**CSS代码规范 | CSS编程标准 | CSS最佳实践 | CSS代码审查** + +CSS样式表语言的编码规范和最佳实践指南,适用于代码审查和质量控制。 + +--- + +## 1. 基本原则 + +- 使用BEM命名方法论 +- 保持代码简洁和可维护 +- 优先使用类选择器 +- 避免过度嵌套 +- 注重代码复用 + +## 2. 命名规范 + +### BEM命名方法论 +**原则**:块、元素、修饰符的组合创建清晰类名结构 + +**块(Block)**:独立组件,如header、menu、button +**元素(Element)**:块的一部分,如header__title、menu__item +**修饰符(Modifier)**:改变外观或行为,如button--primary、menu__item--active + +### 通用命名规则 +**格式规范**: +- 使用小写字母 +- 使用连字符(-)连接单词 +- 使用双下划线(__)表示元素关系 +- 使用双连字符(--)表示修饰符 + +## 3. 代码组织 + +**属性排序**:按逻辑顺序组织CSS属性 + +**属性分组**: +- 定位:position、top、right、bottom、left、z-index +- 盒模型:display、float、width、height、margin、padding、border +- 排版:font、line-height、text-align、word-wrap +- 视觉效果:background、color、opacity、box-shadow +- 其他:cursor、overflow、transition + +## 4. 响应式设计 + +**移动优先**:先为移动设备编写样式,再用媒体查询为大屏幕添加样式 +**相对单位**:优先使用rem、em、vw、vh而不是固定像素 +**断点设置**:设置合理断点,如768px(平板)、1024px(桌面) +**布局适配**:使用百分比、flexbox或grid布局 + +## 5. 性能优化 + +**文件优化**: +- 避免@import,使用link标签 +- 合并和压缩CSS文件 +- 使用CSS Sprites减少图片请求 +- 移除未使用的CSS代码 + +**选择器优化**: +- 避免复杂选择器 +- 优先使用类选择器 +- 避免通配符选择器 +- 减少嵌套层级 + +**动画优化**: +- 优先使用CSS3动画 +- 使用transform和opacity +- 避免影响页面布局 +- 使用will-change优化性能 + +## 6. 浏览器兼容性 + +**前缀处理**:使用Autoprefixer自动处理浏览器前缀 +**浏览器测试**:测试主流浏览器兼容性 +**优雅降级**:为不支持新特性的浏览器提供基础样式 +**样式重置**:使用normalize.css确保一致基础样式 + +## 7. 
CSS变量使用 + +**变量定义**:在:root中定义可复用值,如颜色、字体、间距 +**变量命名**:使用kebab-case,以--开头,如--primary-color +**变量使用**:使用var()函数引用变量 +**变量作用域**:遵循CSS级联规则,可重新定义 \ No newline at end of file diff --git a/docs/builtin/go_standards.md b/docs/builtin/go_standards.md new file mode 100644 index 000000000..5dd437727 --- /dev/null +++ b/docs/builtin/go_standards.md @@ -0,0 +1,55 @@ +# Go编码规范 + +**Go代码规范 | Go编程标准 | Go最佳实践 | Go代码审查** + +Go编程语言的编码规范和最佳实践指南,适用于代码审查和质量控制。 + +--- + +## 1. 基本原则 + +- 简洁性和可读性优先 +- 遵循Go的惯用语法 +- 使用gofmt格式化代码 +- 编写文档注释 +- 注重错误处理 + +## 2. 命名规范 + +**包名**:使用小写单词,如userservice、database +**接口名**:以er结尾,如Reader、Writer、Handler +**结构体和方法**:使用PascalCase,如UserManager、CreateUser +**常量**:使用PascalCase或全大写,如StatusActive、MAX_RETRIES + +## 3. 错误处理 + +**原则**:显式错误处理,每个可能出错的函数返回error类型 + +**策略**: +- 检查参数有效性 +- 使用fmt.Errorf包装错误 +- 使用errors.New创建简单错误 +- 使用自定义错误类型 + +## 4. 并发编程 + +### Goroutines +**原则**:轻量级线程,注意资源管理和同步 + +**最佳实践**: +- 避免创建过多Goroutine +- 使用sync.WaitGroup等待完成 +- 使用channel进行通信 +- 注意生命周期管理 + +### Channels +**规范**:Goroutine间通信的主要方式 + +**最佳实践**: +- 无缓冲channel用于同步通信 +- 有缓冲channel用于异步通信 +- 使用select处理多个channel +- 及时关闭不再使用的channel + +### 同步机制 +**互斥锁**:使用sync.Mutex保护共享资源,用defer确保释放 \ No newline at end of file diff --git a/docs/builtin/html_standards.md b/docs/builtin/html_standards.md new file mode 100644 index 000000000..729e9aac3 --- /dev/null +++ b/docs/builtin/html_standards.md @@ -0,0 +1,57 @@ +# HTML编码规范 + +**HTML代码规范 | HTML编程标准 | HTML最佳实践 | HTML代码审查** + +HTML标记语言的编码规范和最佳实践指南,适用于代码审查和质量控制。 + +--- + +## 1. 基本原则 + +- 使用HTML5文档类型:`<!DOCTYPE html>` +- 使用语义化标签:`
`, `