diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index baaf577..90ee7bd 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "autocode", - "version": "0.7.0", + "version": "0.8.0", "description": "Claude Code plugin for competitive programming problem-setting workflows.", "author": { "name": "SummerOneTwo", diff --git a/CHANGELOG.md b/CHANGELOG.md index cbb5906..9efa130 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.8.0] - 2026-04-28 + +### Improvements + +- **最终测试数据配比约束**: `problem_generate_tests` 采样策略更新为优先保证最终测试集中 `type=3/4`(extreme + tle)不少于一半(候选不足时尽量满足),并返回 `limit_case_count`、`limit_case_minimum_required`、`limit_case_quota_met` 统计字段。 +- **验证阶段硬约束**: `problem_verify_tests` 新增 `limit_ratio` 校验(默认启用),基于生成 manifest 强制检查最终测试中 `type=3/4` 是否达到至少一半,不满足将直接验证失败;可通过 `enable_limit_ratio=false` 显式关闭。 +- **文档与工作流同步**: 更新 README、workflow skill、agent 提示与 prompts 文案,统一说明“最终测试至少一半极限数据”的质量门槛。 + ## [0.7.0] - 2026-04-27 ### Features diff --git a/CLAUDE.md b/CLAUDE.md index 58c8d7f..accd1ee 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -102,7 +102,7 @@ AutoCode/ 5. 构建生成器 (`generator_build`) 6. 运行压力测试 (`stress_test_run`, completed_rounds == total_rounds) 7. 按需构建检查器 (`checker_build`, accuracy >= 0.9) -8. 生成测试数据 (`problem_generate_tests`, generated_test_count > 0) +8. 生成测试数据(`problem_generate_tests`, generated_test_count > 0,且最终 extreme/tle 至少占一半;候选不足时尽量满足) 9. 验证测试数据 (`problem_verify_tests`, passed) 10. 
打包 Polygon (`problem_pack_polygon`) diff --git a/README.md b/README.md index c0b2315..81e0bb1 100644 --- a/README.md +++ b/README.md @@ -246,7 +246,8 @@ AutoCode 提供 15 个原子工具,分为 7 组。所有工具返回统一格 | 工具 | 描述 | 关键参数 | |------|------|----------| | `problem_create` | 初始化题目目录 | `problem_dir`, `problem_name` | -| `problem_generate_tests` | 生成最终测试数据 | `problem_dir`, `test_count` | +| `problem_generate_tests` | 生成最终测试数据(最终数据集中 extreme/tle 至少占一半,候选不足时尽量满足) | `problem_dir`, `test_count` | +| `problem_verify_tests` | 验证测试数据质量(含 extreme/tle 占比硬校验) | `problem_dir`, `tests_dir`, `verify_types` | | `problem_pack_polygon` | 打包为 Polygon 格式 | `problem_dir`, `time_limit`, `memory_limit` | ## 工作流教程:A+B 问题 @@ -378,6 +379,8 @@ problem_generate_tests( ) ``` +说明:最终写入的测试中,`extreme`(type=3)与 `tle`(type=4)合计不少于一半;若候选里极限类不足,则会在可用候选范围内尽量满足并返回对应统计字段。 + ### 步骤 7:打包为 Polygon 格式 ```python @@ -477,6 +480,8 @@ problem_pack_polygon( | `extreme` | 3 | 边界情况:溢出、精度、hash 碰撞 | | `tle` | 4 | 诱导 TLE 的性能测试数据 | +`problem_generate_tests` 的默认采样策略会优先保证最终测试集中 `extreme` + `tle` 至少占 50%,剩余名额再按配置平衡分配(或按确定性顺序填充)。 + ### 文件结构 ``` diff --git a/agents/autocode-workflow.md b/agents/autocode-workflow.md index b3b0443..5a78c20 100644 --- a/agents/autocode-workflow.md +++ b/agents/autocode-workflow.md @@ -25,4 +25,6 @@ Always work through this sequence unless the task is explicitly outside problem When the user asks for a later step directly, explain which prerequisite step is missing and complete the missing work first. +When running `problem_generate_tests`, enforce test quality: final test data should contain at least half limit-oriented cases (`type=3` extreme + `type=4` tle) when candidate availability allows. + Treat hook feedback as authoritative. If a hook denies a tool call, fix the workflow gap instead of retrying the same call. 
diff --git a/pyproject.toml b/pyproject.toml index 2efc2dc..62dafa0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "autocode-mcp" -version = "0.7.0" +version = "0.8.0" description = "MCP Server for competitive programming problem creation, based on AutoCode paper" readme = "README.md" requires-python = ">=3.10" diff --git a/scripts/workflow_guard.py b/scripts/workflow_guard.py index 763b2cf..982b563 100644 --- a/scripts/workflow_guard.py +++ b/scripts/workflow_guard.py @@ -270,7 +270,7 @@ def session_start() -> int: "stress_test_run(completed_rounds == total_rounds) -> " "checker_build if needed (accuracy >= 0.9) -> " "problem_validate(validation_passed) -> " - "problem_generate_tests(generated_test_count > 0) -> " + "problem_generate_tests(generated_test_count > 0, and prefer >=50% type3/type4 in final tests when candidates are sufficient) -> " "problem_verify_tests(passed) -> problem_pack_polygon. " "If a hook blocks a step, complete the missing prerequisite instead of retrying blindly." 
) diff --git a/skills/autocode-workflow/SKILL.md b/skills/autocode-workflow/SKILL.md index ad2b211..1ae7e86 100644 --- a/skills/autocode-workflow/SKILL.md +++ b/skills/autocode-workflow/SKILL.md @@ -61,7 +61,7 @@ Based on the paper "AutoCode: LLMs as Problem Setters for Competitive Programmin │ Phase 8: Test Generation │ │ ┌────────────────────┴────────────────────┐ │ │ │ problem_generate_tests │ Generate final test data │ -│ │ (dedup + validator filter + balance) │ │ +│ │ (dedup + validator filter + extreme>=50%)│ │ │ └────────────────────┬────────────────────┘ │ │ │ │ │ Phase 9: Packaging │ @@ -235,6 +235,7 @@ Required: problem_dir Recommended: test_count=50, enable_dedup=true, enable_validator_filter=true Output: tests/01.in ~ tests/50.in + corresponding .ans files Verify: Check generated_tests count matches test_count +Quality Gate: In final tests, type 3/4 (extreme + tle) should be >= ceil(test_count/2) when candidates are sufficient ``` ### Phase 9: Packaging @@ -283,7 +284,7 @@ Generate 3-5 mutant solutions with common bugs: | 5 | `stress_test_run` | Step 4 | `"All N rounds passed"` | | 6 | `checker_build` (optional) | Step 5 | `accuracy >= 0.9` | | 7 | `problem_validate` | Step 5 or 6 | `success=true`, all samples passed | -| 8 | `problem_generate_tests` | Step 7 | `generated_tests == test_count` | +| 8 | `problem_generate_tests` | Step 7 | `generated_tests == test_count` and `type3+type4 >= ceil(test_count/2)` (if candidates sufficient) | | 9 | `problem_pack_polygon` | Step 8 | `success=true` | ### FORBIDDEN Actions @@ -335,6 +336,7 @@ Before considering the problem complete: - [ ] Statement samples validated (problem_validate passed) - [ ] Sample files validated (problem_validate passed) - [ ] Final test data generated (50+ tests) +- [ ] Final test data has at least 50% extreme/tle cases when candidate pool allows - [ ] Polygon package created ## Example Complete Workflow diff --git a/src/autocode_mcp/__init__.py b/src/autocode_mcp/__init__.py index 
2988cc1..fe52524 100644 --- a/src/autocode_mcp/__init__.py +++ b/src/autocode_mcp/__init__.py @@ -6,7 +6,7 @@ """ import os -__version__ = "0.7.0" +__version__ = "0.8.0" # 获取 templates 目录路径(包内目录) _PACKAGE_DIR = os.path.dirname(__file__) diff --git a/src/autocode_mcp/prompts/__init__.py b/src/autocode_mcp/prompts/__init__.py index 0336fcf..1f697fd 100644 --- a/src/autocode_mcp/prompts/__init__.py +++ b/src/autocode_mcp/prompts/__init__.py @@ -62,7 +62,8 @@ ## 3. 后处理 - 使用 Validator 过滤无效输入 - 去重(基于 signature) -- 平衡分布 +- 先保证最终测试中至少一半是 extreme/tle(type=3/4,候选不足时尽量满足) +- 再平衡分布 - 采样 ## 质量指标 @@ -141,8 +142,9 @@ ### 后处理 1. Validator 过滤 2. 去重(MD5 signature) -3. 平衡分布 -4. 采样 +3. 先保证最终测试中 extreme/tle(type=3/4)不少于一半(候选不足时尽量满足) +4. 对剩余名额平衡分布 +5. 采样 """ # Checker 构建提示词 diff --git a/src/autocode_mcp/tools/problem.py b/src/autocode_mcp/tools/problem.py index b98b377..d6b1e38 100644 --- a/src/autocode_mcp/tools/problem.py +++ b/src/autocode_mcp/tools/problem.py @@ -5,6 +5,7 @@ from __future__ import annotations import hashlib +import json import os import shutil from dataclasses import dataclass @@ -24,6 +25,11 @@ class CandidateTest: signature: str +# 最终测试集中「极限类」占比下限:至少一半来自 generator type 3/4(extreme + TLE 压力) +_LIMIT_STRATEGY_TYPES = frozenset({"3", "4"}) +_TEST_MANIFEST_FILENAME = ".autocode_tests_manifest.json" + + class ProblemCreateTool(Tool): """创建题目目录结构。""" @@ -125,6 +131,7 @@ def description(self) -> str: - 使用 gen.cpp 生成测试数据 - 使用 sol.cpp 生成答案 - 支持去重、平衡、采样 + - 最终测试集中至少一半为极限类(generator type=3 extreme 与 type=4 tle),在候选不足时可能无法完全满足 生成 01.in ~ N.in 及对应的 .ans 文件。 @@ -223,7 +230,7 @@ def input_schema(self) -> dict: }, "enable_balance": { "type": "boolean", - "description": "启用平衡分布(确保各策略类型均衡)", + "description": "启用平衡分布:在已满足「至少一半为 extreme/tle」后,将剩余名额在各非极限类型间尽量均衡分配;关闭时剩余名额按确定性的 (type_param, signature) 顺序填充", "default": True, }, "oversample_ratio": { @@ -255,8 +262,8 @@ async def execute( 1. 生成超额候选数据 2. 去重(基于 MD5 signature) 3. Validator 过滤(自动检测 val.exe) - 4. 平衡分布(按策略类型) - 5. 
采样最终 test_count 个 + 4. 采样:至少一半为 type=3/4(极限 + TLE 压力),其余再平衡或按签名排序 + 5. 输出最终 test_count 个 """ # 验证 constraints 参数 if constraints: @@ -434,18 +441,17 @@ async def execute( seed += 1 - # 平衡分布和采样 - if enable_balance and len(candidates) > test_count: - final_tests = self._balance_and_sample(candidates, test_count) - elif len(candidates) > test_count: - # 简单确定性采样(按 signature 排序) - sorted_candidates = sorted(candidates, key=lambda c: c.signature) - final_tests = sorted_candidates[:test_count] + # 极限占比 + 平衡/确定性采样 + if len(candidates) > test_count: + final_tests = self._balance_and_sample( + candidates, test_count, balance_remainder=enable_balance + ) else: final_tests = candidates # 写入文件 generated_tests = [] + test_manifest: list[dict[str, str | int]] = [] for i, candidate in enumerate(final_tests, 1): test_file = os.path.join(tests_dir, f"{i:02d}.in") ans_file = os.path.join(tests_dir, f"{i:02d}.ans") @@ -456,6 +462,28 @@ async def execute( f.write(candidate.output_data) generated_tests.append(i) + test_manifest.append( + { + "index": i, + "in_file": f"{i:02d}.in", + "ans_file": f"{i:02d}.ans", + "type_param": candidate.type_param, + "signature": candidate.signature, + } + ) + + manifest_path = os.path.join(tests_dir, _TEST_MANIFEST_FILENAME) + with open(manifest_path, "w", encoding="utf-8") as f: + json.dump( + { + "version": 1, + "limit_strategy_types": sorted(_LIMIT_STRATEGY_TYPES), + "tests": test_manifest, + }, + f, + ensure_ascii=False, + indent=2, + ) # 统计信息 type_counts: dict[str, int] = {} @@ -467,6 +495,10 @@ async def execute( type_names.get(k, k): v for k, v in type_counts.items() } + limit_in_final = sum(1 for c in final_tests if c.type_param in _LIMIT_STRATEGY_TYPES) + limit_minimum = (len(final_tests) + 1) // 2 if final_tests else 0 + limit_quota_met = len(final_tests) == 0 or limit_in_final >= limit_minimum + if len(generated_tests) == test_count: return ToolResult.ok( tests_dir=tests_dir, @@ -475,6 +507,9 @@ async def execute( dedup_enabled=enable_dedup, 
def _balance_and_sample(
    self,
    candidates: list[CandidateTest],
    target_count: int,
    balance_remainder: bool = True,
) -> list[CandidateTest]:
    """Sample ``target_count`` tests, reserving half of them for limit cases.

    Selection runs in three deterministic stages:

    1. Up to ``ceil(target_count / 2)`` candidates whose ``type_param`` is in
       ``_LIMIT_STRATEGY_TYPES`` are taken in ``(type_param, signature)``
       order. Because of that ordering, entries of the lower type value are
       consumed before those of the higher one; if fewer limit candidates
       exist than the quota, all of them are taken.
    2. When ``balance_remainder`` is true, the open slots are split as evenly
       as possible across every type still present among the unselected
       candidates (unselected limit candidates included), each type
       contributing its entries in signature order.
    3. Any slots still open are filled from the unselected candidates in
       ``(type_param, signature)`` order. With ``balance_remainder`` false
       this stage fills the whole remainder.

    Candidates are tracked by object identity, not by signature, so entries
    with equal signatures are all eligible.

    Args:
        candidates: Pool of candidate tests to sample from.
        target_count: Number of tests wanted in the result.
        balance_remainder: Whether stage 2 is applied.

    Returns:
        At most ``target_count`` candidates; an empty list when
        ``target_count`` is not positive or ``candidates`` is empty.
    """
    if target_count <= 0 or not candidates:
        return []

    def order_key(candidate: CandidateTest) -> tuple[str, str]:
        return (candidate.type_param, candidate.signature)

    picked: list[CandidateTest] = []
    picked_ids: set[int] = set()

    def take(candidate: CandidateTest) -> None:
        # Identity-based bookkeeping: the same object is never picked twice.
        marker = id(candidate)
        if marker not in picked_ids:
            picked_ids.add(marker)
            picked.append(candidate)

    # Stage 1: reserve the limit-case quota (half, rounded up).
    limit_quota = (target_count + 1) // 2
    limit_candidates = sorted(
        (c for c in candidates if c.type_param in _LIMIT_STRATEGY_TYPES),
        key=order_key,
    )
    for candidate in limit_candidates:
        if len(picked) >= limit_quota:
            break
        take(candidate)

    leftovers = [c for c in candidates if id(c) not in picked_ids]

    # Stage 2: spread the open slots evenly over the types that are left.
    if balance_remainder and leftovers and len(picked) < target_count:
        groups: dict[str, list[CandidateTest]] = {}
        for candidate in leftovers:
            groups.setdefault(candidate.type_param, []).append(candidate)

        open_slots = target_count - len(picked)
        share, extra = divmod(open_slots, len(groups))

        for position, type_param in enumerate(sorted(groups)):
            if len(picked) >= target_count:
                break
            # The first `extra` types (in sorted order) get one more slot.
            allowance = share + 1 if position < extra else share
            ordered_group = sorted(groups[type_param], key=lambda c: c.signature)
            for candidate in ordered_group[:allowance]:
                if len(picked) >= target_count:
                    break
                take(candidate)

    # Stage 3: deterministic top-up for whatever is still missing.
    for candidate in sorted(leftovers, key=order_key):
        if len(picked) >= target_count:
            break
        take(candidate)

    return picked[:target_count]
已运行 problem_generate_tests 生成测试数据 @@ -57,7 +62,13 @@ def input_schema(self) -> dict: "type": "array", "items": { "type": "string", - "enum": ["file_count", "answer_consistency", "validator", "no_empty"], + "enum": [ + "file_count", + "answer_consistency", + "validator", + "no_empty", + "limit_ratio", + ], }, "description": "要执行的验证类型,默认全部执行", }, @@ -65,6 +76,11 @@ def input_schema(self) -> dict: "type": "string", "description": "标准解法文件名(不含扩展名),默认 'sol'", }, + "enable_limit_ratio": { + "type": "boolean", + "description": "是否启用 extreme/tle 占比检查(默认开启;设为 false 可关闭)", + "default": True, + }, "timeout": { "type": "integer", "description": "单次执行超时(秒)", @@ -80,6 +96,7 @@ async def execute( tests_dir: str | None = None, verify_types: list[str] | None = None, sol_name: str | None = None, + enable_limit_ratio: bool = True, timeout: int = 60, ) -> ToolResult: """执行测试数据验证。""" @@ -99,6 +116,12 @@ async def execute( if not verify_types: verify_types = ["file_count", "answer_consistency", "validator", "no_empty"] + if enable_limit_ratio: + if "limit_ratio" not in verify_types: + verify_types.append("limit_ratio") + else: + verify_types = [v for v in verify_types if v != "limit_ratio"] + results = {} all_passed = True @@ -135,6 +158,13 @@ async def execute( if not result["passed"]: all_passed = False + # 5. 
def _check_limit_ratio(self, tests_dir: str) -> dict:
    """Check that limit cases (generator type 3/4) are at least half of the tests.

    The type of each test is looked up in the manifest file
    (``_TEST_MANIFEST_FILENAME``) inside ``tests_dir``. The ``.in`` files
    actually present in the directory decide which tests are counted;
    manifest entries without a matching ``.in`` file are ignored.

    Args:
        tests_dir: Directory holding the ``.in`` files and the manifest.

    Returns:
        A dict with ``passed``, ``total``, ``limit_case_count``,
        ``limit_case_minimum_required`` and ``limit_case_ratio``. When the
        manifest is missing, unreadable or malformed, when some ``.in``
        file is not covered by it, or when there are no ``.in`` files, the
        dict reports ``passed=False`` and carries ``error`` (plus
        ``missing_in_manifest`` for uncovered files). Otherwise it carries
        ``limit_strategy_types`` and ``passed`` reflects the ratio check.
    """

    def failure(
        error: str, *, total: int = 0, minimum_required: int = 0, **extra: object
    ) -> dict:
        # Single place that builds the "check could not pass" payload.
        return {
            "passed": False,
            "total": total,
            "limit_case_count": 0,
            "limit_case_minimum_required": minimum_required,
            "limit_case_ratio": 0.0,
            **extra,
            "error": error,
        }

    manifest_path = os.path.join(tests_dir, _TEST_MANIFEST_FILENAME)
    if not os.path.exists(manifest_path):
        return failure(f"manifest not found: {manifest_path}")

    try:
        with open(manifest_path, encoding="utf-8") as f:
            manifest = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
        # (a manifest that is not valid UTF-8), so neither escapes the check.
        return failure(f"failed to read manifest: {e}")

    # Valid JSON is not necessarily an object; a list or scalar has no .get().
    if not isinstance(manifest, dict):
        return failure("invalid manifest format: top-level value must be an object")

    tests = manifest.get("tests", [])
    if not isinstance(tests, list):
        return failure("invalid manifest format: tests must be a list")

    in_files = sorted(name for name in os.listdir(tests_dir) if name.endswith(".in"))

    # Map each .in file name to its generator type; malformed entries are skipped.
    type_by_in_file: dict[str, str] = {}
    for item in tests:
        if not isinstance(item, dict):
            continue
        in_file = item.get("in_file")
        type_param = item.get("type_param")
        if isinstance(in_file, str) and isinstance(type_param, str):
            type_by_in_file[in_file] = type_param

    missing_in_manifest = [name for name in in_files if name not in type_by_in_file]
    if missing_in_manifest:
        return failure(
            "manifest does not cover all .in files",
            total=len(in_files),
            minimum_required=(len(in_files) + 1) // 2,
            missing_in_manifest=missing_in_manifest,
        )

    total = len(in_files)
    if total == 0:
        return failure("no .in files found")

    limit_case_count = sum(
        1 for name in in_files if type_by_in_file[name] in _LIMIT_STRATEGY_TYPES
    )
    minimum_required = (total + 1) // 2  # half of the tests, rounded up

    return {
        "passed": limit_case_count >= minimum_required,
        "total": total,
        "limit_case_count": limit_case_count,
        "limit_case_minimum_required": minimum_required,
        "limit_case_ratio": limit_case_count / total,
        "limit_strategy_types": sorted(_LIMIT_STRATEGY_TYPES),
    }
def test_problem_verify_tests_limit_ratio_passes_with_manifest():
    """The limit-ratio check passes when exactly half of the tests are type 3/4."""
    # (type_param, signature) for tests 01..04: two ordinary, two limit cases.
    layout = [("1", "a"), ("2", "b"), ("3", "c"), ("4", "d")]

    with tempfile.TemporaryDirectory() as tmpdir:
        entries = []
        for number, (type_param, signature) in enumerate(layout, 1):
            in_name = f"{number:02d}.in"
            ans_name = f"{number:02d}.ans"
            with open(os.path.join(tmpdir, in_name), "w", encoding="utf-8") as f:
                f.write("x\n")
            with open(os.path.join(tmpdir, ans_name), "w", encoding="utf-8") as f:
                f.write("y\n")
            entries.append(
                {
                    "in_file": in_name,
                    "ans_file": ans_name,
                    "type_param": type_param,
                    "signature": signature,
                }
            )

        manifest_path = os.path.join(tmpdir, ".autocode_tests_manifest.json")
        with open(manifest_path, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "version": 1,
                    "limit_strategy_types": ["3", "4"],
                    "tests": entries,
                },
                f,
            )

        outcome = ProblemVerifyTestsTool()._check_limit_ratio(tmpdir)
        assert outcome["passed"] is True
        assert outcome["limit_case_count"] == 2
        assert outcome["limit_case_minimum_required"] == 2
@pytest.mark.asyncio
async def test_problem_verify_tests_default_enables_limit_ratio():
    """limit_ratio runs by default even when verify_types does not list it."""
    verifier = ProblemVerifyTestsTool()
    requested_checks = ["file_count", "no_empty"]  # limit_ratio deliberately absent

    with tempfile.TemporaryDirectory() as tmpdir:
        for file_name in ("01.in", "01.ans"):
            target = os.path.join(tmpdir, file_name)
            with open(target, "w", encoding="utf-8") as f:
                f.write("1\n")

        # No manifest is written, so the implicitly added limit_ratio check
        # is expected to make the overall verification fail.
        outcome = await verifier.execute(
            problem_dir=tmpdir,
            tests_dir=tmpdir,
            verify_types=requested_checks,
        )
        assert not outcome.success
        assert outcome.data.get("limit_ratio_enabled") is True
        assert "limit_ratio" in outcome.data.get("results", {})
def test_balance_and_sample_at_least_half_extreme_or_tle():
    """With enough candidates, at least half of the sample is type 3/4."""
    sampler = ProblemGenerateTestsTool()

    # Ten candidates per type; signatures are "<prefix><i>", e.g. "a0".."a9".
    signature_prefix = {"1": "a", "2": "b", "3": "c", "4": "d"}
    pool = [
        CandidateTest(
            input_data=f"{type_param}-{prefix}{i}",
            output_data="o",
            type_param=type_param,
            signature=f"{prefix}{i}",
        )
        for type_param, prefix in signature_prefix.items()
        for i in range(10)
    ]

    # (requested size, minimum number of limit cases) for an even and an odd size.
    for size, min_limit_cases in ((10, 5), (11, 6)):
        sample = sampler._balance_and_sample(pool, size, balance_remainder=True)
        limit_cases = [c for c in sample if c.type_param in ("3", "4")]
        assert len(sample) == size
        assert len(limit_cases) >= min_limit_cases
test_problem_generate_tests_balance(): """测试平衡分布功能。""" @@ -590,6 +748,8 @@ async def test_problem_generate_tests_balance(): assert result.success assert result.data.get("balance_enabled") is True + assert result.data.get("limit_case_quota_met") is True + assert result.data.get("limit_case_count", 0) >= 4 # 8 条中至少 4 条为 extreme/tle # 检查类型分布 type_dist = result.data.get("type_distribution", {}) # 应该有 4 种类型,每种 2 个 diff --git a/uv.lock b/uv.lock index df8efbd..6301f9b 100644 --- a/uv.lock +++ b/uv.lock @@ -36,7 +36,7 @@ wheels = [ [[package]] name = "autocode-mcp" -version = "0.7.0" +version = "0.8.0" source = { editable = "." } dependencies = [ { name = "mcp" },