diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..7d275df --- /dev/null +++ b/.env.example @@ -0,0 +1,37 @@ +# ── Bittensor ── +NETUID=XX # Your subnet UID (assigned after registration) +SUBTENSOR_NETWORK=finney # finney | test | local +SUBTENSOR_CHAIN_ENDPOINT= # Custom endpoint (optional) + +# ── Wallet ── +WALLET_NAME=my_wallet +WALLET_HOTKEY=default + +# ── Miner ── +MINER_BACKEND=openai # openai | anthropic | local | agent +MINER_MODEL=gpt-4o # Model identifier +OPENAI_API_KEY=sk-... # If using OpenAI backend +ANTHROPIC_API_KEY=sk-ant-... # If using Anthropic backend +MINER_PORT=8091 +MINER_MAX_CONCURRENT=4 + +# ── Validator ── +VALIDATOR_PORT=8092 +VALIDATOR_EPOCH_LENGTH=360 +VALIDATOR_TASKS_PER_EPOCH=12 +VALIDATOR_TRAP_RATE=0.15 +VALIDATOR_TIMEOUT=300 +VALIDATOR_SANDBOX_ENABLED=true +VALIDATOR_LEAN4_ENABLED=true +VALIDATOR_EMBEDDING_MODEL=all-MiniLM-L6-v2 + +# ── Gateway ── +GATEWAY_PORT=8000 +GATEWAY_API_KEY_SECRET=your-secret-key + +# ── Monitoring ── +PROMETHEUS_PORT=9090 +GRAFANA_PASSWORD=admin + +# ── State ── +STATE_DB_PATH=state/reasonforge.db diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml new file mode 100644 index 0000000..0845563 --- /dev/null +++ b/.github/workflows/build-docker.yml @@ -0,0 +1,65 @@ +name: Build Docker Images + +on: + push: + tags: + - "v*" + +env: + REGISTRY: ghcr.io + IMAGE_PREFIX: ${{ github.repository_owner }}/reasonforge + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + strategy: + fail-fast: false + matrix: + include: + - image: miner + dockerfile: docker/Dockerfile.miner + - image: validator + dockerfile: docker/Dockerfile.validator + - image: gateway + dockerfile: docker/Dockerfile.gateway + - image: sandbox + dockerfile: docker/Dockerfile.sandbox + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to 
Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}-${{ matrix.image }} + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha + + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: . + file: ${{ matrix.dockerfile }} + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..25656ca --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,56 @@ +name: Lint + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install ruff + run: pip install ruff + + - name: Run ruff check + run: ruff check . + + - name: Run ruff format check + run: ruff format --check . 
+ + mypy: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Cache pip dependencies + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-mypy-${{ hashFiles('requirements*.txt') }} + restore-keys: | + ${{ runner.os }}-pip-mypy- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install mypy + + - name: Run mypy + run: mypy reasonforge/ --ignore-missing-imports diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..6ab561c --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,49 @@ +name: Tests + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip dependencies + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('requirements*.txt') }} + restore-keys: | + ${{ runner.os }}-pip-${{ matrix.python-version }}- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-cov pytest-asyncio + + - name: Run tests + run: | + pytest tests/ -v --tb=short --cov=reasonforge --cov-report=xml --cov-report=term-missing + + - name: Upload coverage + if: matrix.python-version == '3.12' + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: coverage.xml diff --git a/.gitignore b/.gitignore index 1a9b311..eb7a092 100644 --- a/.gitignore 
+++ b/.gitignore @@ -45,6 +45,16 @@ build/ # Simulation output simulation/*.json +# State (top-level state directory, not reasonforge/state/) +/state/*.db +/state/ + # IDE .vscode/ .idea/ + +# Docker +docker/.env + +# Wallets (never commit) +.bittensor/ diff --git a/PLAN_PRODUCTION.md b/PLAN_PRODUCTION.md new file mode 100644 index 0000000..ed7f88e --- /dev/null +++ b/PLAN_PRODUCTION.md @@ -0,0 +1,2391 @@ +# ReasonForge — Production Subnet Build Plan + +> **Purpose**: Upgrade the existing MVP (simulation-only codebase) into a production-ready Bittensor subnet deployable to testnet and mainnet. This plan assumes the MVP is already built and working — all 13 whitepaper formulas implemented, simulator passing tests, CLI running multi-epoch simulations. +> +> **Prerequisite**: The MVP codebase from `PLAN.md` must be complete and passing all tests before starting this plan. +> +> **Target**: Bittensor SDK v10 compatible subnet with real miners, validators, on-chain weight setting, formal verification, sandboxed code execution, embedding-based plagiarism detection, persistent state, API gateway, monitoring, Docker deployment, and CI/CD. + +--- + +## Table of Contents + +1. [Architecture Delta: MVP → Production](#1-architecture-delta) +2. [Directory Structure](#2-directory-structure) +3. [Phase 1: Bittensor Protocol Layer](#phase-1) +4. [Phase 2: Real Miner Neuron](#phase-2) +5. [Phase 3: Real Validator Neuron](#phase-3) +6. [Phase 4: Formal Verification & Sandboxed Execution](#phase-4) +7. [Phase 5: Embedding-Based Plagiarism Detection](#phase-5) +8. [Phase 6: Task Sourcing & Benchmark Database](#phase-6) +9. [Phase 7: Persistent State & Recovery](#phase-7) +10. [Phase 8: API Gateway & External Access](#phase-8) +11. [Phase 9: Monitoring & Observability](#phase-9) +12. [Phase 10: Security Hardening](#phase-10) +13. [Phase 11: Docker & Deployment](#phase-11) +14. [Phase 12: CI/CD & Testing Infrastructure](#phase-12) +15. [Phase 13: Documentation & SDK](#phase-13) +16. 
[Subnet Hyperparameters](#subnet-hyperparameters) +17. [Environment Variables](#environment-variables) +18. [Build Order](#build-order) +19. [Success Criteria](#success-criteria) + +--- + +## 1. Architecture Delta: MVP → Production + +### What the MVP has (keep unless marked otherwise): +``` +reasonforge/ +├── types.py → KEEP: All dataclasses, constants, enums +├── engine.py → KEEP: All 13 whitepaper formulas (stateless, tested) +├── simulator.py → KEEP: Used for offline testing + benchmarking +├── plagiarism.py → REPLACE: Upgrade from Jaccard to embedding cosine similarity +├── task_generator.py → EXPAND: Add real benchmark DB + API task ingestion +├── run.py → KEEP: CLI simulation still useful for development +├── __init__.py → KEEP +tests/ → EXPAND: Add integration tests for real neurons +api/server.py → EXPAND: Add auth, rate limiting, task submission +dashboard/App.jsx → EXPAND: Add live network stats panel +``` + +### What production adds: +``` +NEW: reasonforge/protocol.py — Bittensor Synapse definitions (wire protocol) +NEW: reasonforge/base/ — Base neuron class (wallet, subtensor, metagraph) +NEW: neurons/miner.py — Real miner neuron (Axon server, LLM reasoning) +NEW: neurons/validator.py — Real validator neuron (Dendrite client, scoring, weight setting) +NEW: reasonforge/verification/ — Lean4 proof checker, code sandbox, math checker, fact checker +NEW: reasonforge/embeddings/ — Sentence-transformer embedding + cosine similarity +NEW: reasonforge/state/ — SQLite persistence, checkpoint/recovery +NEW: reasonforge/gateway/ — External API gateway (auth, billing, rate limits) +NEW: reasonforge/monitoring/ — Prometheus metrics, structured logging +NEW: reasonforge/security/ — Input sanitization, DoS protection, rate limiting +NEW: scripts/ — Deployment, wallet setup, registration +NEW: docker/ — Dockerfiles for miner, validator, gateway, sandbox +NEW: .github/workflows/ — CI/CD pipelines +``` + +### Key Architectural Decisions: +1. 
**MVP engine.py is the single source of truth** — production neurons call the same `ScoringEngine` methods. No formula reimplementation. +2. **Validators own the scoring loop** — miners just solve tasks and return reasoning chains. +3. **Validators set on-chain weights** — computed from S_epoch scores, submitted via `subtensor.set_weights()`. +4. **State is local-first** — SQLite for persistence, no external DB dependency. +5. **Miners are model-agnostic** — any LLM backend (local, API, agent framework) plugs in via a standard interface. + +--- + +## 2. Directory Structure + +``` +reasonforge/ +├── README.md +├── PLAN.md # MVP plan (reference) +├── PLAN_PRODUCTION.md # This file +├── pyproject.toml +├── requirements.txt # Core deps +├── requirements-miner.txt # Miner-specific deps (torch, transformers) +├── requirements-validator.txt # Validator-specific deps (lean4, docker) +├── requirements-dev.txt # Dev deps (pytest, mypy, ruff) +├── .env.example # Environment variable template +├── .gitignore +│ +├── reasonforge/ # Core package (MVP + production extensions) +│ ├── __init__.py +│ ├── types.py # [MVP] Constants, dataclasses, enums +│ ├── engine.py # [MVP] All 13 whitepaper formulas +│ ├── simulator.py # [MVP] Epoch simulator (offline testing) +│ ├── task_generator.py # [MVP→EXPANDED] + benchmark DB + API ingestion +│ ├── run.py # [MVP] CLI runner +│ │ +│ ├── protocol.py # [NEW] Bittensor Synapse definitions +│ │ +│ ├── base/ # [NEW] Base neuron infrastructure +│ │ ├── __init__.py +│ │ ├── neuron.py # BaseNeuron: wallet, subtensor, metagraph, registration +│ │ └── config.py # CLI argument parsing, config management +│ │ +│ ├── miner/ # [NEW] Miner-side modules +│ │ ├── __init__.py +│ │ ├── reasoning.py # ReasoningEngine interface + implementations +│ │ ├── backends/ # LLM backend adapters +│ │ │ ├── __init__.py +│ │ │ ├── base.py # Abstract LLMBackend class +│ │ │ ├── openai_backend.py # OpenAI/compatible API backend +│ │ │ ├── anthropic_backend.py # Anthropic API 
backend +│ │ │ ├── local_backend.py # Local transformers/vLLM backend +│ │ │ └── agent_backend.py # LangGraph/CrewAI agent backend +│ │ ├── domain_router.py # Route tasks to domain-specialized prompts +│ │ └── proof_generator.py # Generate formal proof fragments +│ │ +│ ├── validator/ # [NEW] Validator-side modules +│ │ ├── __init__.py +│ │ ├── scoring.py # Orchestrates full scoring pipeline +│ │ ├── objective_scorer.py # Domain-specific automated checks +│ │ ├── consensus.py # Stake-weighted trimmed median +│ │ ├── weight_setter.py # On-chain weight computation + submission +│ │ ├── task_manager.py # Task queue, dispatch, assignment +│ │ └── trap_manager.py # Trap problem injection + tracking +│ │ +│ ├── verification/ # [NEW] Formal verification backends +│ │ ├── __init__.py +│ │ ├── lean4_checker.py # Lean 4 proof verification +│ │ ├── code_sandbox.py # Docker-isolated code execution +│ │ ├── math_checker.py # SymPy numerical/symbolic verification +│ │ └── fact_checker.py # Citation and factual claim verification +│ │ +│ ├── embeddings/ # [NEW] Embedding-based similarity +│ │ ├── __init__.py +│ │ └── similarity.py # Sentence-transformer cosine similarity +│ │ +│ ├── state/ # [NEW] Persistence layer +│ │ ├── __init__.py +│ │ ├── database.py # SQLite schema + CRUD operations +│ │ ├── checkpoint.py # State serialization/recovery +│ │ └── migrations.py # Schema versioning +│ │ +│ ├── gateway/ # [NEW] External API gateway +│ │ ├── __init__.py +│ │ ├── app.py # FastAPI app (public-facing) +│ │ ├── auth.py # API key management, JWT +│ │ ├── billing.py # Usage tracking, quotas +│ │ ├── rate_limiter.py # Token-bucket rate limiting +│ │ └── schemas.py # Request/response Pydantic models +│ │ +│ ├── monitoring/ # [NEW] Observability +│ │ ├── __init__.py +│ │ ├── metrics.py # Prometheus counters/histograms/gauges +│ │ ├── logger.py # Structured JSON logging +│ │ └── health.py # Health check endpoints +│ │ +│ └── security/ # [NEW] Security utilities +│ ├── __init__.py +│ ├── 
sanitizer.py # Input validation, injection prevention +│ ├── rate_guard.py # Per-UID rate limiting for validators +│ └── anomaly.py # Anomaly detection in miner behavior +│ +├── neurons/ # [NEW] Neuron entry points +│ ├── miner.py # Miner neuron entry point +│ └── validator.py # Validator neuron entry point +│ +├── benchmarks/ # [NEW] Benchmark task database +│ ├── README.md +│ ├── mathematics/ # Domain-specific benchmark sets +│ │ ├── algebra.json +│ │ ├── calculus.json +│ │ ├── number_theory.json +│ │ └── combinatorics.json +│ ├── code/ +│ │ ├── algorithms.json +│ │ ├── systems.json +│ │ └── debugging.json +│ ├── scientific/ +│ │ ├── physics.json +│ │ ├── chemistry.json +│ │ └── biology.json +│ ├── strategic/ +│ │ ├── game_theory.json +│ │ ├── optimization.json +│ │ └── planning.json +│ ├── causal/ +│ │ ├── inference.json +│ │ └── counterfactual.json +│ └── ethical/ +│ ├── dilemmas.json +│ └── policy_analysis.json +│ +├── api/ # [MVP→EXPANDED] Internal simulation API +│ ├── __init__.py +│ └── server.py +│ +├── dashboard/ # [MVP→EXPANDED] React dashboard +│ └── App.jsx +│ +├── tests/ # [MVP→EXPANDED] Full test suite +│ ├── test_engine.py # [MVP] Formula unit tests +│ ├── test_simulator.py # [MVP] Simulation integration tests +│ ├── test_types.py # [MVP] Type/constant tests +│ ├── test_protocol.py # [NEW] Synapse serialization tests +│ ├── test_scoring.py # [NEW] Validator scoring pipeline +│ ├── test_verification.py # [NEW] Lean4, sandbox, math checker +│ ├── test_embeddings.py # [NEW] Similarity detection tests +│ ├── test_state.py # [NEW] Persistence + recovery +│ ├── test_gateway.py # [NEW] API gateway tests +│ ├── test_security.py # [NEW] Input sanitization tests +│ ├── test_integration_local.py # [NEW] Full miner↔validator on localnet +│ └── conftest.py # Shared fixtures +│ +├── scripts/ # [NEW] Deployment & operations +│ ├── setup_wallets.sh # Create owner/miner/validator wallets +│ ├── register_subnet.sh # Register subnet on testnet/mainnet +│ ├── 
register_neurons.sh # Register miner/validator UIDs +│ ├── stake.sh # Stake TAO to validator +│ ├── run_localnet.sh # Start local subtensor for development +│ ├── benchmark_import.py # Import benchmark tasks into DB +│ ├── generate_traps.py # Generate trap problems with ground truth +│ └── health_check.py # Production health verification +│ +├── docker/ # [NEW] Container definitions +│ ├── Dockerfile.miner # Miner container +│ ├── Dockerfile.validator # Validator container +│ ├── Dockerfile.gateway # API gateway container +│ ├── Dockerfile.sandbox # Isolated code execution container +│ ├── docker-compose.yml # Full stack (miner + validator + gateway + monitoring) +│ ├── docker-compose.localnet.yml # Local development stack +│ └── docker-compose.monitoring.yml # Prometheus + Grafana stack +│ +├── monitoring/ # [NEW] Monitoring configs +│ ├── prometheus.yml # Prometheus scrape config +│ ├── grafana/ +│ │ └── dashboards/ +│ │ ├── subnet_overview.json # Grafana dashboard: subnet metrics +│ │ └── miner_performance.json # Grafana dashboard: per-miner stats +│ └── alerts/ +│ └── rules.yml # Alerting rules +│ +├── docs/ # [MVP→EXPANDED] Documentation +│ ├── ARCHITECTURE.md # System architecture +│ ├── PROTOCOL.md # Wire protocol specification +│ ├── MINER_GUIDE.md # How to run a miner +│ ├── VALIDATOR_GUIDE.md # How to run a validator +│ ├── API_REFERENCE.md # External API docs +│ ├── DEPLOYMENT.md # Production deployment guide +│ ├── SECURITY.md # Security model & threat analysis +│ └── BENCHMARKS.md # Benchmark format & contribution guide +│ +├── .github/ # [NEW] CI/CD +│ └── workflows/ +│ ├── test.yml # Run tests on PR +│ ├── lint.yml # Ruff + mypy +│ ├── build-docker.yml # Build & push containers +│ └── release.yml # Tag-based release +│ +└── min_compute.yml # [NEW] Bittensor minimum compute requirements +``` + +--- + +## Phase 1: Bittensor Protocol Layer + +### 1.1 — protocol.py (Wire Protocol) + +Define all Synapse subclasses for validator↔miner communication. 
Every Synapse inherits from `bt.Synapse`. + +```python +import bittensor as bt +from typing import Optional, List +from pydantic import Field + +class ReasoningTask(bt.Synapse): + """Validator → Miner: Here is a reasoning task to solve.""" + + # ── Immutable fields (set by validator, read by miner) ── + task_id: str # UUID + problem: str # Natural language problem statement + domain: str # "mathematics"|"code"|"scientific"|"strategic"|"causal"|"ethical" + difficulty: int = Field(ge=1, le=10) # Difficulty level + timeout_seconds: int = 300 # Max time to solve + context: Optional[str] = None # Additional context/data + constraints: Optional[str] = None # Specific constraints + + # ── Mutable fields (filled by miner, read back by validator) ── + reasoning_steps: Optional[List[dict]] = None # List of {step_id, reasoning, evidence, confidence} + final_answer: Optional[str] = None # Final answer text + proof_status: Optional[str] = None # "VERIFIED"|"FAILED"|None + proof_artifact: Optional[str] = None # Base64 encoded proof file (Lean4, Coq) + code_artifact: Optional[str] = None # Code solution if applicable + time_taken_ms: Optional[int] = None # Self-reported solve time + submission_hash: Optional[str] = None # SHA-256 of steps+answer for integrity + + # Required by Bittensor + required_hash_fields: List[str] = ["task_id", "problem", "domain", "difficulty"] + + def deserialize(self) -> dict: + """Deserialize response into scoreable format.""" + return { + "task_id": self.task_id, + "steps": self.reasoning_steps or [], + "final_answer": self.final_answer, + "proof_status": self.proof_status, + "proof_artifact": self.proof_artifact, + "code_artifact": self.code_artifact, + "time_taken_ms": self.time_taken_ms, + "submission_hash": self.submission_hash, + } + + +class HealthCheck(bt.Synapse): + """Validator → Miner: Are you alive and what are your capabilities?""" + + # Mutable + status: Optional[str] = None # "ready"|"busy"|"warming_up" + supported_domains: 
Optional[List[str]] = None # Which domains this miner supports + model_info: Optional[str] = None # Model identifier (optional, for transparency) + version: Optional[str] = None # Miner software version + + required_hash_fields: List[str] = [] + + def deserialize(self) -> dict: + return { + "status": self.status, + "supported_domains": self.supported_domains, + "model_info": self.model_info, + "version": self.version, + } + + +class TaskResult(bt.Synapse): + """Validator → Miner: Here are your scores for a batch of tasks (informational).""" + + # Immutable + epoch_id: int + miner_uid: int + scores: Optional[List[dict]] = None # [{task_id, cms, rank}] + s_epoch: Optional[float] = None + rank: Optional[int] = None + total_tao: Optional[float] = None + + required_hash_fields: List[str] = ["epoch_id", "miner_uid"] + + def deserialize(self) -> dict: + return { + "epoch_id": self.epoch_id, + "scores": self.scores, + "s_epoch": self.s_epoch, + "rank": self.rank, + } +``` + +**Rules:** +- `required_hash_fields` contains only the fields the validator sets (immutable). Miner cannot tamper with these. +- Mutable fields use `Optional[...] = None` — the miner fills them in. +- Synapses are Pydantic models under the hood — they auto-serialize/deserialize for transit. +- Keep Synapse payloads under 1MB. For large proof artifacts, use base64 encoding with compression. +- The `submission_hash` field lets validators verify integrity: `SHA256(json.dumps(steps) + final_answer)`. + +### 1.2 — base/neuron.py (Base Neuron Class) + +All neurons (miner and validator) inherit from this base: + +```python +class BaseNeuron: + """Shared infrastructure for miners and validators.""" + + neuron_type: str # "miner" or "validator" + + def __init__(self, config=None): + # 1. Parse CLI args / config + self.config = config or self.get_config() + + # 2. 
Initialize Bittensor objects + self.wallet = bt.Wallet(config=self.config) + self.subtensor = bt.Subtensor(config=self.config) + self.metagraph = self.subtensor.metagraph(netuid=self.config.netuid) + + # 3. Check registration + self.uid = self.get_uid() + if self.uid is None: + bt.logging.error("Neuron not registered. Run: btcli register") + exit(1) + + # 4. Initialize state persistence + self.state_db = StateDatabase( + db_path=f"state/{self.neuron_type}_{self.uid}.db" + ) + + # 5. Initialize metrics + self.metrics = MetricsCollector(neuron_type=self.neuron_type, uid=self.uid) + + # 6. Load previous state if exists + self.load_state() + + @staticmethod + def get_config() -> bt.Config: + parser = argparse.ArgumentParser() + parser.add_argument("--netuid", type=int, required=True) + parser.add_argument("--subtensor.network", type=str, default="finney") + parser.add_argument("--subtensor.chain_endpoint", type=str, default=None) + parser.add_argument("--logging.debug", action="store_true") + bt.Wallet.add_args(parser) + bt.Subtensor.add_args(parser) + bt.logging.add_args(parser) + return bt.Config(parser) + + def get_uid(self) -> Optional[int]: + """Find our UID in the metagraph.""" + hotkey = self.wallet.hotkey.ss58_address + if hotkey in self.metagraph.hotkeys: + return self.metagraph.hotkeys.index(hotkey) + return None + + def sync(self): + """Re-sync metagraph from chain.""" + self.metagraph.sync(subtensor=self.subtensor) + + def should_sync_metagraph(self) -> bool: + """Sync every 5 blocks (60 seconds).""" + current_block = self.subtensor.get_current_block() + return (current_block - self.last_sync_block) >= 5 + + def save_state(self): + """Persist neuron state to SQLite.""" + self.state_db.save_checkpoint(self.get_state_dict()) + + def load_state(self): + """Restore from last checkpoint.""" + state = self.state_db.load_latest_checkpoint() + if state: + self.restore_state_dict(state) + + @abstractmethod + def get_state_dict(self) -> dict: ... 
+ + @abstractmethod + def restore_state_dict(self, state: dict): ... + + @abstractmethod + def run(self): ... +``` + +### 1.3 — base/config.py + +Extend CLI argument parsing for miner-specific and validator-specific flags: + +```python +class MinerConfig: + """Additional args for miner neurons.""" + @staticmethod + def add_args(parser): + parser.add_argument("--miner.backend", type=str, default="openai", + choices=["openai", "anthropic", "local", "agent"]) + parser.add_argument("--miner.model", type=str, default="gpt-4o") + parser.add_argument("--miner.api_key_env", type=str, default="OPENAI_API_KEY") + parser.add_argument("--miner.max_concurrent", type=int, default=4) + parser.add_argument("--miner.port", type=int, default=8091) + parser.add_argument("--miner.domains", type=str, nargs="+", + default=["mathematics", "code", "scientific", "strategic", "causal", "ethical"]) + +class ValidatorConfig: + """Additional args for validator neurons.""" + @staticmethod + def add_args(parser): + parser.add_argument("--validator.epoch_length", type=int, default=360, + help="Blocks per epoch (360 = ~72 min)") + parser.add_argument("--validator.tasks_per_epoch", type=int, default=12) + parser.add_argument("--validator.trap_rate", type=float, default=0.15) + parser.add_argument("--validator.timeout", type=int, default=300) + parser.add_argument("--validator.sample_size", type=int, default=16, + help="Number of miners to query per task") + parser.add_argument("--validator.port", type=int, default=8092) + parser.add_argument("--validator.sandbox_enabled", action="store_true") + parser.add_argument("--validator.lean4_enabled", action="store_true") + parser.add_argument("--validator.embedding_model", type=str, + default="all-MiniLM-L6-v2") +``` + +--- + +## Phase 2: Real Miner Neuron + +### 2.1 — neurons/miner.py (Entry Point) + +The miner neuron: +1. Registers an **Axon** server +2. Attaches handler functions for each Synapse type +3. 
Serves continuously, responding to validator queries + +```python +class ReasonForgeMiner(BaseNeuron): + neuron_type = "miner" + + def __init__(self, config=None): + super().__init__(config) + + # Initialize LLM backend based on config + self.reasoning_engine = ReasoningEngine( + backend=self.config.miner.backend, + model=self.config.miner.model, + domains=self.config.miner.domains, + ) + + # Create Axon server + self.axon = bt.axon(wallet=self.wallet, config=self.config) + + # Attach handlers + self.axon.attach( + forward_fn=self.handle_reasoning_task, + blacklist_fn=self.blacklist_reasoning_task, + priority_fn=self.priority_reasoning_task, + ).attach( + forward_fn=self.handle_health_check, + ).attach( + forward_fn=self.handle_task_result, + ) + + async def handle_reasoning_task(self, synapse: ReasoningTask) -> ReasoningTask: + """Core handler: receive task, produce reasoning chain, return.""" + start_time = time.time_ns() + + try: + # Route to domain-specific prompt template + prompt = self.reasoning_engine.domain_router.build_prompt(synapse) + + # Execute multi-step reasoning + result = await self.reasoning_engine.solve( + problem=synapse.problem, + domain=synapse.domain, + difficulty=synapse.difficulty, + context=synapse.context, + constraints=synapse.constraints, + timeout=synapse.timeout_seconds, + ) + + # Fill mutable Synapse fields + synapse.reasoning_steps = [ + { + "step_id": i, + "reasoning": step.reasoning, + "evidence": step.evidence, + "confidence": step.confidence, + "formal_proof_fragment": step.formal_proof_fragment, + } + for i, step in enumerate(result.steps) + ] + synapse.final_answer = result.final_answer + synapse.proof_status = result.proof_status + synapse.proof_artifact = result.proof_artifact + synapse.code_artifact = result.code_artifact + synapse.time_taken_ms = int((time.time_ns() - start_time) / 1_000_000) + synapse.submission_hash = self._compute_hash(synapse) + + except Exception as e: + bt.logging.error(f"Task {synapse.task_id} 
failed: {e}") + synapse.final_answer = f"ERROR: {str(e)}" + synapse.reasoning_steps = [] + + return synapse + + def blacklist_reasoning_task(self, synapse: ReasoningTask) -> tuple[bool, str]: + """Reject requests from non-validators or unregistered neurons.""" + caller_hotkey = synapse.dendrite.hotkey + if caller_hotkey not in self.metagraph.hotkeys: + return True, "Unregistered hotkey" + caller_uid = self.metagraph.hotkeys.index(caller_hotkey) + if not self.metagraph.validator_permit[caller_uid]: + return True, "No validator permit" + return False, "" + + def priority_reasoning_task(self, synapse: ReasoningTask) -> float: + """Higher-stake validators get priority.""" + caller_hotkey = synapse.dendrite.hotkey + caller_uid = self.metagraph.hotkeys.index(caller_hotkey) + return float(self.metagraph.S[caller_uid]) + + def run(self): + """Main loop.""" + bt.logging.info(f"Miner starting on UID {self.uid}") + self.axon.serve(netuid=self.config.netuid, subtensor=self.subtensor) + self.axon.start() + + while True: + if self.should_sync_metagraph(): + self.sync() + time.sleep(12) # One block +``` + +### 2.2 — miner/reasoning.py (Reasoning Engine) + +Abstract interface + orchestrator that routes to the correct backend: + +```python +class ReasoningEngine: + def __init__(self, backend: str, model: str, domains: list[str]): + self.backend = self._create_backend(backend, model) + self.domain_router = DomainRouter(domains) + + async def solve(self, problem, domain, difficulty, context, constraints, timeout) -> ReasoningResult: + """ + Execute multi-step reasoning: + 1. Build domain-specific system prompt + 2. Request chain-of-thought from LLM + 3. Parse structured reasoning steps + 4. Attempt formal proof generation (math/code domains) + 5. Return structured result + """ + ... 
+ +@dataclass +class ReasoningResult: + steps: list[ReasoningStep] + final_answer: str + proof_status: Optional[str] + proof_artifact: Optional[str] + code_artifact: Optional[str] +``` + +### 2.3 — miner/backends/base.py (LLM Backend Interface) + +```python +class LLMBackend(ABC): + @abstractmethod + async def generate(self, messages: list[dict], temperature: float, + max_tokens: int, timeout: int) -> str: ... + + @abstractmethod + async def generate_structured(self, messages: list[dict], + schema: dict, timeout: int) -> dict: ... +``` + +**Implementations required:** +- `openai_backend.py` — OpenAI/compatible (GPT-4o, DeepSeek, local vLLM) +- `anthropic_backend.py` — Anthropic (Claude Sonnet/Opus) +- `local_backend.py` — HuggingFace transformers, direct GPU inference +- `agent_backend.py` — LangGraph/CrewAI multi-agent reasoning + +### 2.4 — miner/domain_router.py + +Maps domains to specialized system prompts and output parsers: + +```python +DOMAIN_PROMPTS = { + Domain.MATHEMATICS: """You are a mathematical reasoning engine. For each step: + 1. State your approach + 2. Show formal work + 3. Verify the step + If possible, express proofs in Lean 4 syntax. + Output: structured JSON with steps array.""", + + Domain.CODE: """You are a code reasoning engine. For each step: + 1. Analyze requirements + 2. Design solution approach + 3. Implement with test cases + Output includes executable code artifact.""", + + # ... 
(all 6 domains) +} +``` + +--- + +## Phase 3: Real Validator Neuron + +### 3.1 — neurons/validator.py (Entry Point) + +The validator neuron runs the **main epoch loop**: + +```python +class ReasonForgeValidator(BaseNeuron): + neuron_type = "validator" + + def __init__(self, config=None): + super().__init__(config) + + # Initialize components + self.dendrite = bt.dendrite(wallet=self.wallet) + self.task_manager = TaskManager(config=self.config) + self.trap_manager = TrapManager(trap_rate=self.config.validator.trap_rate) + self.scorer = ValidatorScorer(config=self.config) + self.weight_setter = WeightSetter(subtensor=self.subtensor, config=self.config) + self.similarity_detector = SimilarityDetector( + model_name=self.config.validator.embedding_model + ) + + # State tracking (per-epoch) + self.miner_states: dict[int, MinerState] = {} # uid → MinerState + self.epoch_id: int = 0 + self.scores: torch.FloatTensor = torch.zeros(256) # Weight vector + + def run(self): + """Main validator loop.""" + bt.logging.info(f"Validator starting on UID {self.uid}") + + while True: + try: + # 1. Sync metagraph + self.sync() + + # 2. Check if epoch boundary + current_block = self.subtensor.get_current_block() + if self.is_epoch_boundary(current_block): + self.run_epoch() + + # 3. 
Sleep for one block + time.sleep(12) + + except Exception as e: + bt.logging.error(f"Validator loop error: {e}") + traceback.print_exc() + time.sleep(12) + + def run_epoch(self): + """Execute one complete scoring epoch.""" + self.epoch_id += 1 + bt.logging.info(f"=== EPOCH {self.epoch_id} ===") + + # Phase A: Generate tasks for this epoch + tasks = self.task_manager.generate_epoch_tasks( + count=self.config.validator.tasks_per_epoch, + trap_rate=self.config.validator.trap_rate, + ) + + # Phase B: For each task, query miners and score + all_task_results = [] + for task in tasks: + task_result = asyncio.run(self.process_task(task)) + all_task_results.append(task_result) + + # Phase C: Compute epoch scores using MVP engine + self.compute_epoch_scores(all_task_results) + + # Phase D: Compute and set on-chain weights + self.set_weights() + + # Phase E: Persist state + self.save_state() + + # Phase F: Send score notifications to miners (informational) + asyncio.run(self.notify_miners()) + + async def process_task(self, task: Task) -> TaskProcessingResult: + """Query miners, collect responses, score them.""" + + # 1. Select miners to query (sample or all) + miner_uids = self.get_queryable_miners() + + # 2. Build Synapse + synapse = ReasoningTask( + task_id=task.task_id, + problem=task.problem, + domain=task.domain.value, + difficulty=task.difficulty, + timeout_seconds=self.config.validator.timeout, + ) + + # 3. Query miners via dendrite + axons = [self.metagraph.axons[uid] for uid in miner_uids] + responses: List[ReasoningTask] = await self.dendrite( + axons=axons, + synapse=synapse, + timeout=self.config.validator.timeout, + ) + + # 4. Score each response + scored_results = [] + for uid, response in zip(miner_uids, responses): + # 4a. Check for timeout/failure + if response.final_answer is None: + scored_results.append((uid, DimensionScores(0, 0, 0, 0))) + continue + + # 4b. 
Verify submission hash integrity + expected_hash = self._compute_hash(response) + if response.submission_hash != expected_hash: + bt.logging.warning(f"UID {uid}: hash mismatch, penalizing") + scored_results.append((uid, DimensionScores(0, 0, 0, 0))) + continue + + # 4c. Run plagiarism check against other responses + similarity = self.similarity_detector.check_against_batch( + response, [r for r in responses if r != response] + ) + plagiarism_penalty = SIMILARITY_PENALTY if similarity > SIMILARITY_THRESHOLD else 1.0 + + # 4d. Compute objective score (automated checks) + o_score = await self.scorer.compute_objective_score(task, response) + + # 4e. Compute dimension scores + dim_scores = await self.scorer.compute_dimensions(task, response) + + # 4f. Apply plagiarism penalty + dim_scores = DimensionScores( + quality=dim_scores.quality * plagiarism_penalty, + accuracy=dim_scores.accuracy * plagiarism_penalty, + novelty=dim_scores.novelty * plagiarism_penalty, + efficiency=dim_scores.efficiency, + ) + + scored_results.append((uid, dim_scores)) + + # 4g. Track trap scores + if task.is_trap: + self.track_trap_score(uid, dim_scores.cms, task.ground_truth_score) + + return TaskProcessingResult(task=task, scored_results=scored_results) + + def compute_epoch_scores(self, task_results: list): + """Aggregate per-task CMS into S_epoch using MVP engine.""" + for uid in self.get_all_miner_uids(): + miner_state = self.get_or_create_miner_state(uid) + + # Gather CMS scores and difficulty multipliers for this miner + cms_list = [] + diff_mults = [] + for tr in task_results: + for scored_uid, dim_scores in tr.scored_results: + if scored_uid == uid: + cms = ScoringEngine.compute_cms(dim_scores) + cms_list.append(cms) + diff_mults.append(tr.task.difficulty_multiplier) + + if not cms_list: + continue + + # Compute trap penalty (Eq. 9) + trap_penalty = ScoringEngine.compute_trap_penalty(miner_state.trap_scores) + + # Compute S_epoch (Eq. 
3) + miner_state.s_epoch = ScoringEngine.compute_s_epoch( + cms_list, diff_mults, trap_penalty + ) + + # Rank miners + ranked = sorted( + [ms for ms in self.miner_states.values() if ms.s_epoch > 0], + key=lambda m: m.s_epoch, reverse=True + ) + for i, ms in enumerate(ranked): + ms.rank = i + 1 + # Update streak + if ms.rank <= PEB_K: + ms.streak += 1 + else: + ms.streak = 0 + # Compute PEB (Eq. 4) + ms.peb = ScoringEngine.compute_peb(ms.rank, ms.streak) + + def set_weights(self): + """Compute normalized weight vector and submit to chain.""" + weights = torch.zeros(self.metagraph.n) + + for uid, ms in self.miner_states.items(): + if uid < len(weights): + # Weight = S_epoch * (1 + PEB) — same as emission formula denominator + weights[uid] = ms.s_epoch * (1.0 + ms.peb) + + # Normalize to sum to 1 + total = weights.sum() + if total > 0: + weights = weights / total + + # Submit to chain + success = self.subtensor.set_weights( + netuid=self.config.netuid, + wallet=self.wallet, + uids=torch.arange(self.metagraph.n), + weights=weights, + ) + + if success: + bt.logging.info(f"Weights set successfully for epoch {self.epoch_id}") + else: + bt.logging.error("Failed to set weights on chain") +``` + +### 3.2 — validator/scoring.py (Full Scoring Pipeline) + +Orchestrates the scoring pipeline using the MVP's `ScoringEngine`: + +```python +class ValidatorScorer: + """ + Wraps the MVP ScoringEngine for production use. + Adds objective verification backends on top of the formula layer. + """ + + def __init__(self, config): + self.engine = ScoringEngine() # MVP engine — all formulas + self.lean4 = Lean4Checker() if config.validator.lean4_enabled else None + self.sandbox = CodeSandbox() if config.validator.sandbox_enabled else None + self.math_checker = MathChecker() + self.fact_checker = FactChecker() + + async def compute_dimensions(self, task: Task, response: ReasoningTask) -> DimensionScores: + """ + Compute all 4 dimension scores for a miner's response. 
+ Maps to the Quality, Accuracy, Novelty, Efficiency dimensions in the whitepaper. + """ + quality = self._score_quality(task, response) + accuracy = await self._score_accuracy(task, response) + novelty = self._score_novelty(task, response) + efficiency = self._score_efficiency(task, response) + return DimensionScores(quality, accuracy, novelty, efficiency) + + def _score_quality(self, task, response) -> float: + """ + Quality (40% of CMS): + - Step coherence: Do steps logically follow each other? + - Completeness: Are all aspects of the problem addressed? + - Depth: Sufficient detail per step? + - Formal proof fragments present? (bonus for math/code) + """ + steps = response.reasoning_steps or [] + if not steps: + return 0.0 + + # Step count vs difficulty expectation + expected_steps = max(3, task.difficulty) + step_ratio = min(1.0, len(steps) / expected_steps) + + # Average confidence + avg_confidence = sum(s.get("confidence", 0) for s in steps) / len(steps) + + # Evidence presence + evidence_ratio = sum(1 for s in steps if s.get("evidence")) / len(steps) + + # Proof fragment bonus + proof_bonus = 0.1 if any(s.get("formal_proof_fragment") for s in steps) else 0.0 + + return min(1.0, (0.3 * step_ratio) + (0.3 * avg_confidence) + (0.2 * evidence_ratio) + (0.2 + proof_bonus)) + + async def _score_accuracy(self, task, response) -> float: + """ + Accuracy (30% of CMS): + Domain-specific automated checks → Eq. 11 objective scoring. 
+ """ + domain = Domain(task.domain) if isinstance(task.domain, str) else task.domain + + if domain == Domain.MATHEMATICS: + checks = {} + if self.lean4 and response.proof_artifact: + checks["proof"] = await self.lean4.verify(response.proof_artifact) + checks["numerical"] = self.math_checker.verify( + task.problem, response.final_answer + ) + checks["steps"] = self._verify_math_steps(response.reasoning_steps) + weights = DOMAIN_CHECK_WEIGHTS[Domain.MATHEMATICS] + + elif domain == Domain.CODE: + checks = {} + if self.sandbox and response.code_artifact: + checks["tests"] = await self.sandbox.run_tests(response.code_artifact) + checks["static_analysis"] = await self.sandbox.lint(response.code_artifact) + checks["formal"] = 0.5 # Default if no sandbox + weights = DOMAIN_CHECK_WEIGHTS[Domain.CODE] + + # ... (other domains) + + return self.engine.compute_objective_score(checks, weights) + + def _score_novelty(self, task, response) -> float: + """ + Novelty (15% of CMS): + - Unique approach vs common solutions + - Creative reasoning paths + - Non-trivial insights + """ + steps = response.reasoning_steps or [] + if not steps: + return 0.0 + + # Heuristic: longer, more varied reasoning → higher novelty + avg_step_length = sum(len(s.get("reasoning", "")) for s in steps) / len(steps) + length_score = min(1.0, avg_step_length / 500) + + # Unique terms ratio (rough diversity measure) + all_words = " ".join(s.get("reasoning", "") for s in steps).split() + diversity = len(set(all_words)) / max(1, len(all_words)) + + return min(1.0, 0.5 * length_score + 0.5 * diversity) + + def _score_efficiency(self, task, response) -> float: + """ + Efficiency (15% of CMS): + - Solve time relative to timeout + - Steps vs difficulty (conciseness) + """ + time_ms = response.time_taken_ms or (task.timeout_seconds * 1000) + timeout_ms = task.timeout_seconds * 1000 + + # Faster = better, but don't reward instant (likely garbage) + time_ratio = time_ms / timeout_ms + if time_ratio < 0.01: # 
Suspiciously fast + time_score = 0.2 + elif time_ratio > 1.0: # Timed out + time_score = 0.0 + else: + time_score = 1.0 - (time_ratio * 0.5) # Linear penalty, max 0.5 deduction + + return min(1.0, time_score) +``` + +### 3.3 — validator/weight_setter.py + +```python +class WeightSetter: + """Compute and submit on-chain weights from epoch scores.""" + + def __init__(self, subtensor, config): + self.subtensor = subtensor + self.config = config + + def compute_weights(self, miner_states: dict[int, MinerState], n: int) -> tuple: + """ + Convert S_epoch + PEB into normalized weight vector. + This is the core mapping from off-chain scoring → on-chain Yuma Consensus input. + """ + uids = [] + weights = [] + + for uid in range(n): + if uid in miner_states and miner_states[uid].s_epoch > 0: + w = miner_states[uid].s_epoch * (1.0 + miner_states[uid].peb) + uids.append(uid) + weights.append(w) + + if not weights: + return torch.tensor([]), torch.tensor([]) + + # Normalize + weight_tensor = torch.FloatTensor(weights) + weight_tensor = weight_tensor / weight_tensor.sum() + + return torch.tensor(uids), weight_tensor + + def submit(self, uids, weights) -> bool: + """Submit weights to chain with retry logic.""" + max_retries = 3 + for attempt in range(max_retries): + try: + success = self.subtensor.set_weights( + netuid=self.config.netuid, + wallet=self.wallet, + uids=uids, + weights=weights, + wait_for_inclusion=True, + wait_for_finalization=False, + ) + if success: + return True + except Exception as e: + bt.logging.warning(f"Weight setting attempt {attempt+1} failed: {e}") + time.sleep(5) + return False +``` + +### 3.4 — validator/consensus.py + +```python +def compute_consensus_score( + validator_scores: list[tuple[float, float]], # (score, stake) + trim_delta: float = CONSENSUS_TRIM_DELTA, +) -> float: + """ + Stake-weighted trimmed median (Eq. 12). + Reuses ScoringEngine.compute_consensus_score from MVP. 
+ This wrapper adapts production validator data into the format the engine expects. + """ + return ScoringEngine.compute_consensus_score(validator_scores, trim_delta) +``` + +### 3.5 — validator/trap_manager.py + +```python +class TrapManager: + """Inject trap problems with known ground-truth scores.""" + + def __init__(self, trap_rate: float = TRAP_RATE): + self.trap_rate = trap_rate + self.trap_db = self._load_traps() + + def inject_traps(self, tasks: list[Task]) -> list[Task]: + """Replace trap_rate fraction of tasks with traps.""" + n_traps = max(1, int(len(tasks) * self.trap_rate)) + trap_tasks = random.sample(self.trap_db, min(n_traps, len(self.trap_db))) + + # Replace last n_traps tasks + for i in range(n_traps): + tasks[-(i+1)] = trap_tasks[i] + + random.shuffle(tasks) + return tasks + + def evaluate_trap_response(self, task: Task, response: ReasoningTask) -> float: + """Compare response against ground truth. Returns score 0-1.""" + # For math traps: numerical comparison + # For code traps: test case execution + # For others: embedding similarity to known-correct answer + ... +``` + +--- + +## Phase 4: Formal Verification & Sandboxed Execution + +### 4.1 — verification/lean4_checker.py + +```python +class Lean4Checker: + """ + Verify Lean 4 proof artifacts submitted by miners. + Requires: lean4 toolchain installed in validator environment. + """ + + def __init__(self, lean_path: str = "lean"): + self.lean_path = lean_path + self.timeout = 60 # seconds + + async def verify(self, proof_b64: str) -> float: + """ + Decode proof artifact → write to temp .lean file → run lean4 → check exit code. + Returns: 1.0 if proof compiles, 0.0 if it fails, 0.5 if timeout. 
+ """ + proof_text = base64.b64decode(proof_b64).decode("utf-8") + + with tempfile.NamedTemporaryFile(suffix=".lean", mode="w", delete=False) as f: + f.write(proof_text) + f.flush() + + try: + result = await asyncio.wait_for( + asyncio.create_subprocess_exec( + self.lean_path, f.name, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ), + timeout=self.timeout, + ) + stdout, stderr = await result.communicate() + return 1.0 if result.returncode == 0 else 0.0 + except asyncio.TimeoutError: + return 0.5 + finally: + os.unlink(f.name) +``` + +### 4.2 — verification/code_sandbox.py + +```python +class CodeSandbox: + """ + Run miner code submissions in an isolated Docker container. + Prevents: filesystem access, network access, fork bombs, resource exhaustion. + """ + + def __init__(self, image: str = "reasonforge-sandbox:latest"): + self.image = image + self.client = docker.from_env() + self.timeout = 30 + self.memory_limit = "256m" + self.cpu_period = 100000 + self.cpu_quota = 50000 # 50% of one core + + async def run_tests(self, code_b64: str) -> float: + """Execute code in sandbox, run any included test cases.""" + code = base64.b64decode(code_b64).decode("utf-8") + + container = self.client.containers.run( + self.image, + command=["python3", "-c", code], + detach=True, + mem_limit=self.memory_limit, + cpu_period=self.cpu_period, + cpu_quota=self.cpu_quota, + network_disabled=True, + read_only=True, + tmpfs={"/tmp": "size=64m"}, + ) + + try: + result = container.wait(timeout=self.timeout) + logs = container.logs().decode("utf-8") + + if result["StatusCode"] == 0: + # Parse test output for pass/fail counts + return self._parse_test_results(logs) + return 0.0 + except Exception: + return 0.0 + finally: + container.remove(force=True) + + async def lint(self, code_b64: str) -> float: + """Run ruff/pylint on code, return quality score.""" + ... 
+``` + +### 4.3 — verification/math_checker.py + +```python +class MathChecker: + """Numerical and symbolic verification using SymPy.""" + + def verify(self, problem: str, answer: str) -> float: + """ + Try to: + 1. Parse the answer as a mathematical expression + 2. Evaluate numerically + 3. Compare against independent computation + """ + try: + # Extract numerical value from answer + parsed = sympify(answer) + # For known problem types, verify against computed solution + # Returns 1.0 for correct, 0.0 for incorrect, 0.5 for unverifiable + ... + except: + return 0.5 # Can't verify → neutral score +``` + +### 4.4 — docker/Dockerfile.sandbox + +```dockerfile +FROM python:3.12-slim +RUN pip install --no-cache-dir numpy scipy sympy +RUN useradd -m sandbox +USER sandbox +WORKDIR /tmp +# Network and filesystem isolation (no network, read-only root, tmpfs /tmp) is enforced by CodeSandbox at container run time, not by this image +``` + +--- + +## Phase 5: Embedding-Based Plagiarism Detection + +### 5.1 — embeddings/similarity.py + +Replace the MVP's Jaccard similarity with embedding-based cosine similarity: + +```python +from sentence_transformers import SentenceTransformer +import numpy as np + +class SimilarityDetector: + """ + Detect plagiarism between miner submissions using sentence embeddings. 
+ Uses: sentence-transformers/all-MiniLM-L6-v2 (fast, 384-dim) + """ + + def __init__(self, model_name: str = "all-MiniLM-L6-v2"): + self.model = SentenceTransformer(model_name) + self.history_embeddings: list[np.ndarray] = [] # Rolling buffer + self.max_history = 5000 + + def embed_submission(self, response: ReasoningTask) -> np.ndarray: + """Encode reasoning chain into a single embedding vector.""" + steps_text = " ".join( + s.get("reasoning", "") for s in (response.reasoning_steps or []) + ) + full_text = f"{steps_text} {response.final_answer or ''}" + return self.model.encode(full_text, normalize_embeddings=True) + + def check_against_batch(self, response: ReasoningTask, + other_responses: list[ReasoningTask]) -> float: + """Return max cosine similarity against other responses in this batch.""" + if not other_responses: + return 0.0 + + target_emb = self.embed_submission(response) + other_embs = np.array([self.embed_submission(r) for r in other_responses]) + + # Cosine similarity (embeddings are normalized, so dot product = cosine) + similarities = other_embs @ target_emb + return float(np.max(similarities)) + + def check_against_history(self, response: ReasoningTask) -> float: + """Check against historical submissions (cross-epoch plagiarism).""" + if not self.history_embeddings: + return 0.0 + + target_emb = self.embed_submission(response) + history_matrix = np.array(self.history_embeddings[-self.max_history:]) + similarities = history_matrix @ target_emb + return float(np.max(similarities)) + + def add_to_history(self, response: ReasoningTask): + """Store embedding for future cross-epoch checks.""" + emb = self.embed_submission(response) + self.history_embeddings.append(emb) + if len(self.history_embeddings) > self.max_history: + self.history_embeddings = self.history_embeddings[-self.max_history:] +``` + +--- + +## Phase 6: Task Sourcing & Benchmark Database + +### 6.1 — Benchmark JSON Format + +Every benchmark file is a JSON array of task objects: + 
+```json +[ + { + "task_id": "math-algebra-001", + "problem": "Prove that for all positive integers n, the sum 1 + 2 + ... + n = n(n+1)/2", + "domain": "mathematics", + "difficulty": 4, + "timeout_seconds": 300, + "ground_truth": "By mathematical induction...", + "ground_truth_score": 0.95, + "is_trap": false, + "previously_unsolved": false, + "tags": ["induction", "series", "algebra"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] +``` + +### 6.2 — Benchmark Requirements + +| Domain | Min Tasks | Difficulty Spread | Trap Tasks | +|--------|-----------|-------------------|------------| +| Mathematics | 100 | 1-10 evenly | 15 | +| Code | 100 | 1-10 evenly | 15 | +| Scientific | 80 | 1-10 evenly | 12 | +| Strategic | 60 | 1-10 evenly | 9 | +| Causal | 60 | 1-10 evenly | 9 | +| Ethical | 50 | 1-8 (no 9-10) | 8 | +| **Total** | **450** | | **68** | + +### 6.3 — task_generator.py (Expanded) + +Add to existing MVP task_generator: + +```python +class TaskGenerator: + """Production task generator with benchmark DB + synthetic + API ingestion.""" + + def __init__(self, benchmark_dir: str = "benchmarks/"): + self.benchmark_db = self._load_benchmarks(benchmark_dir) + self.used_task_ids: set[str] = set() # Avoid repeats within window + + def generate_epoch_tasks(self, count: int, trap_rate: float) -> list[Task]: + """Generate a balanced set of tasks for one epoch.""" + n_traps = max(1, int(count * trap_rate)) + n_benchmark = count - n_traps + + tasks = [] + + # 1. Sample benchmark tasks (balanced across domains) + tasks += self._sample_balanced(n_benchmark) + + # 2. Add trap problems + tasks += self._sample_traps(n_traps) + + # 3. Shuffle to hide traps + random.shuffle(tasks) + + return tasks + + def ingest_api_task(self, request: dict) -> Task: + """Accept an external task submission via the API gateway.""" + # Validate, assign difficulty, create Task object + ... 
+``` + +--- + +## Phase 7: Persistent State & Recovery + +### 7.1 — state/database.py + +SQLite schema for persistent neuron state: + +```python +class StateDatabase: + SCHEMA = """ + CREATE TABLE IF NOT EXISTS miner_epochs ( + epoch_id INTEGER, + miner_uid INTEGER, + s_epoch REAL, + peb REAL, + rank INTEGER, + streak INTEGER, + tao_earned REAL, + trap_penalty REAL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (epoch_id, miner_uid) + ); + + CREATE TABLE IF NOT EXISTS validator_epochs ( + epoch_id INTEGER, + validator_uid INTEGER, + vas REAL, + reputation_multiplier REAL, + tao_earned REAL, + slashed REAL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (epoch_id, validator_uid) + ); + + CREATE TABLE IF NOT EXISTS task_results ( + task_id TEXT PRIMARY KEY, + epoch_id INTEGER, + domain TEXT, + difficulty INTEGER, + is_trap BOOLEAN, + avg_cms REAL, + best_miner_uid INTEGER, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS submissions ( + submission_id TEXT PRIMARY KEY, + task_id TEXT, + miner_uid INTEGER, + cms REAL, + quality REAL, + accuracy REAL, + novelty REAL, + efficiency REAL, + submission_hash TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (task_id) REFERENCES task_results(task_id) + ); + + CREATE TABLE IF NOT EXISTS checkpoints ( + checkpoint_id INTEGER PRIMARY KEY AUTOINCREMENT, + epoch_id INTEGER, + state_blob TEXT, -- JSON serialized state + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS embedding_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + epoch_id INTEGER, + miner_uid INTEGER, + task_id TEXT, + embedding BLOB, -- numpy array bytes + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + + CREATE INDEX IF NOT EXISTS idx_miner_epochs_uid ON miner_epochs(miner_uid); + CREATE INDEX IF NOT EXISTS idx_submissions_miner ON submissions(miner_uid); + CREATE INDEX IF NOT EXISTS idx_submissions_task ON submissions(task_id); + """ 
+``` + +### 7.2 — state/checkpoint.py + +```python +class CheckpointManager: + """Save and restore full neuron state for crash recovery.""" + + def save(self, db: StateDatabase, epoch_id: int, state: dict): + """Serialize state to JSON and store in checkpoints table.""" + ... + + def load_latest(self, db: StateDatabase) -> Optional[dict]: + """Load most recent checkpoint for crash recovery.""" + ... + + def prune_old(self, db: StateDatabase, keep_last: int = 10): + """Remove old checkpoints to save disk space.""" + ... +``` + +--- + +## Phase 8: API Gateway & External Access + +### 8.1 — gateway/app.py + +External-facing FastAPI application for users to submit tasks and query results: + +```python +app = FastAPI(title="ReasonForge Gateway", version="0.1.0") + +# Endpoints: +# POST /v1/tasks — Submit a reasoning task (authenticated) +# GET /v1/tasks/{task_id} — Get task status and results +# GET /v1/leaderboard — Current miner rankings +# GET /v1/stats — Network statistics +# GET /v1/health — Health check +# WS /v1/stream — WebSocket for real-time epoch updates + +@app.post("/v1/tasks", dependencies=[Depends(verify_api_key)]) +async def submit_task(request: TaskSubmissionRequest): + """Submit a reasoning task to the network.""" + # 1. Validate input + # 2. Assign difficulty (if not provided) + # 3. Queue for next epoch + # 4. Return task_id for polling + ... + +@app.get("/v1/tasks/{task_id}") +async def get_task_result(task_id: str): + """Poll for task results.""" + ... + +@app.get("/v1/leaderboard") +async def get_leaderboard(domain: Optional[str] = None, limit: int = 20): + """Get current miner rankings.""" + ... +``` + +### 8.2 — gateway/auth.py + +```python +class APIKeyManager: + """API key management with usage tracking and rate limits.""" + + def __init__(self, db: StateDatabase): + self.db = db + + def create_key(self, owner: str, tier: str = "free") -> str: + """Generate new API key. Tiers: free (100 req/mo), pro (10k), enterprise (unlimited).""" + ... 
+ + def verify_key(self, key: str) -> Optional[APIKeyInfo]: + """Validate key and check rate limits.""" + ... + + def track_usage(self, key: str, task_id: str): + """Record API usage for billing.""" + ... +``` + +### 8.3 — gateway/schemas.py + +```python +class TaskSubmissionRequest(BaseModel): + problem: str = Field(..., min_length=10, max_length=10000) + domain: Optional[str] = None # Auto-detect if not provided + difficulty: Optional[int] = Field(None, ge=1, le=10) + timeout_seconds: Optional[int] = Field(300, ge=30, le=600) + callback_url: Optional[str] = None # Webhook for async notification + +class TaskResultResponse(BaseModel): + task_id: str + status: str # "queued"|"processing"|"completed"|"failed" + result: Optional[dict] = None + best_answer: Optional[str] = None + confidence: Optional[float] = None + reasoning_steps: Optional[list[dict]] = None + processing_time_ms: Optional[int] = None +``` + +--- + +## Phase 9: Monitoring & Observability + +### 9.1 — monitoring/metrics.py + +```python +from prometheus_client import Counter, Histogram, Gauge, start_http_server + +class MetricsCollector: + """Prometheus metrics for subnet monitoring.""" + + def __init__(self, neuron_type: str, uid: int): + prefix = f"reasonforge_{neuron_type}" + + # Counters + self.tasks_processed = Counter(f"{prefix}_tasks_total", "Total tasks processed", ["domain", "difficulty"]) + self.epochs_completed = Counter(f"{prefix}_epochs_total", "Epochs completed") + self.traps_injected = Counter(f"{prefix}_traps_total", "Trap problems injected") + self.breakthroughs = Counter(f"{prefix}_breakthroughs_total", "Breakthrough solutions") + self.plagiarism_detected = Counter(f"{prefix}_plagiarism_total", "Plagiarism detections") + self.weight_set_failures = Counter(f"{prefix}_weight_failures_total", "Weight setting failures") + + # Histograms + self.task_latency = Histogram(f"{prefix}_task_latency_seconds", "Task processing time", ["domain"]) + self.cms_distribution = 
Histogram(f"{prefix}_cms_score", "CMS score distribution", buckets=[0.1*i for i in range(11)]) + self.vas_distribution = Histogram(f"{prefix}_vas_score", "VAS score distribution", buckets=[0.1*i for i in range(11)]) + + # Gauges + self.current_epoch = Gauge(f"{prefix}_current_epoch", "Current epoch number") + self.active_miners = Gauge(f"{prefix}_active_miners", "Number of active miners") + self.avg_cms = Gauge(f"{prefix}_avg_cms", "Average CMS this epoch") + self.total_emission = Gauge(f"{prefix}_total_emission_tao", "Total TAO emitted") + self.top_miner_score = Gauge(f"{prefix}_top_miner_score", "Highest S_epoch") + + # Start metrics server + start_http_server(9090 + uid) +``` + +### 9.2 — monitoring/logger.py + +```python +import structlog + +def setup_logging(neuron_type: str, uid: int, debug: bool = False): + """Configure structured JSON logging.""" + structlog.configure( + processors=[ + structlog.stdlib.add_log_level, + structlog.stdlib.add_logger_name, + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.JSONRenderer(), + ], + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + ) + + logger = structlog.get_logger() + return logger.bind(neuron_type=neuron_type, uid=uid) +``` + +### 9.3 — Grafana Dashboards + +Create two dashboard JSON files: + +**subnet_overview.json** — Panels: +- Total tasks processed (counter) +- Average CMS per epoch (timeseries) +- Miner count (gauge) +- Emission distribution (pie chart) +- Trap detection rate (timeseries) +- Weight setting success rate (timeseries) + +**miner_performance.json** — Panels: +- Per-miner S_epoch over time (multi-line) +- CMS dimension breakdown (stacked bar) +- PEB distribution (bar) +- Streak lengths (table) +- Plagiarism events (annotations) + +--- + +## Phase 10: Security Hardening + +### 10.1 — security/sanitizer.py + +```python +class InputSanitizer: + """Validate and sanitize all inputs from miners and external API.""" + + MAX_STEP_LENGTH = 10000 # chars per 
reasoning step + MAX_STEPS = 50 # max steps per submission + MAX_ANSWER_LENGTH = 50000 # chars + MAX_PROOF_SIZE = 1_000_000 # bytes (1MB) + MAX_CODE_SIZE = 500_000 # bytes (500KB) + + @staticmethod + def sanitize_submission(response: ReasoningTask) -> ReasoningTask: + """Validate all miner-provided fields.""" + # 1. Cap the number of reasoning steps at MAX_STEPS + if response.reasoning_steps and len(response.reasoning_steps) > InputSanitizer.MAX_STEPS: + response.reasoning_steps = response.reasoning_steps[:InputSanitizer.MAX_STEPS] + + # 2. Truncate each step's reasoning text to MAX_STEP_LENGTH + if response.reasoning_steps: + for step in response.reasoning_steps: + step["reasoning"] = step.get("reasoning", "")[:InputSanitizer.MAX_STEP_LENGTH] + + # 3. Validate proof artifact size + if response.proof_artifact: + decoded = base64.b64decode(response.proof_artifact) + if len(decoded) > InputSanitizer.MAX_PROOF_SIZE: + response.proof_artifact = None + + # 4. Validate code artifact size + if response.code_artifact: + decoded = base64.b64decode(response.code_artifact) + if len(decoded) > InputSanitizer.MAX_CODE_SIZE: + response.code_artifact = None + + return response +``` + +### 10.2 — security/rate_guard.py + +```python +class RateGuard: + """Per-UID rate limiting to prevent DoS.""" + + def __init__(self, max_requests_per_minute: int = 10): + self.limits: dict[int, list[float]] = defaultdict(list) + self.max_rpm = max_requests_per_minute + + def check(self, uid: int) -> bool: + """Returns True if request is allowed.""" + now = time.time() + self.limits[uid] = [t for t in self.limits[uid] if now - t < 60] + if len(self.limits[uid]) >= self.max_rpm: + return False + self.limits[uid].append(now) + return True +``` + +### 10.3 — security/anomaly.py + +```python +class AnomalyDetector: + """Detect suspicious miner behavior patterns.""" + + def check_timing_anomaly(self, time_ms: int, difficulty: int) -> bool: + """Flag if solve time is unrealistically fast for difficulty.""" + min_expected = difficulty * 
500 # ms + return time_ms < min_expected + + def check_score_manipulation(self, cms_history: list[float]) -> bool: + """Flag if CMS scores are suspiciously consistent (gaming).""" + if len(cms_history) < 5: + return False + variance = statistics.variance(cms_history) + return variance < 0.001 # Nearly identical scores = suspicious + + def check_collusion(self, submissions: list[ReasoningTask]) -> list[tuple[int, int, float]]: + """Detect colluding miners with near-identical submissions.""" + # Compare all pairs, flag if similarity > threshold + ... +``` + +--- + +## Phase 11: Docker & Deployment + +### 11.1 — docker/Dockerfile.miner + +```dockerfile +FROM python:3.12-slim + +WORKDIR /app + +# System deps +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential git curl && rm -rf /var/lib/apt/lists/* + +# Python deps +COPY requirements.txt requirements-miner.txt ./ +RUN pip install --no-cache-dir -r requirements.txt -r requirements-miner.txt + +# App code +COPY reasonforge/ reasonforge/ +COPY neurons/miner.py neurons/ + +# Wallet mount point +VOLUME /root/.bittensor/wallets + +# Metrics port +EXPOSE 9091 + +# Entry +ENTRYPOINT ["python", "neurons/miner.py"] +CMD ["--netuid", "XX", "--subtensor.network", "finney"] +``` + +### 11.2 — docker/Dockerfile.validator + +```dockerfile +FROM python:3.12-slim + +WORKDIR /app + +# System deps (includes docker-cli for sandbox) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential git curl docker.io && rm -rf /var/lib/apt/lists/* + +# Install Lean 4 (optional, for math verification) +RUN curl -sSf https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh | sh -s -- -y --default-toolchain leanprover/lean4:stable +ENV PATH="/root/.elan/bin:$PATH" + +# Python deps +COPY requirements.txt requirements-validator.txt ./ +RUN pip install --no-cache-dir -r requirements.txt -r requirements-validator.txt + +# App code +COPY reasonforge/ reasonforge/ +COPY 
neurons/validator.py neurons/ +COPY benchmarks/ benchmarks/ + +# Volumes +VOLUME /root/.bittensor/wallets +VOLUME /app/state + +# Ports: metrics + API +EXPOSE 9092 8092 + +ENTRYPOINT ["python", "neurons/validator.py"] +CMD ["--netuid", "XX", "--subtensor.network", "finney", "--validator.sandbox_enabled", "--validator.lean4_enabled"] +``` + +### 11.3 — docker-compose.yml + +```yaml +version: "3.8" + +services: + validator: + build: + context: . + dockerfile: docker/Dockerfile.validator + volumes: + - ~/.bittensor/wallets:/root/.bittensor/wallets:ro + - validator-state:/app/state + - /var/run/docker.sock:/var/run/docker.sock # For sandbox containers + environment: + - NETUID=${NETUID} + - SUBTENSOR_NETWORK=${SUBTENSOR_NETWORK:-finney} + ports: + - "9092:9092" # Metrics + - "8092:8092" # API + restart: unless-stopped + + miner: + build: + context: . + dockerfile: docker/Dockerfile.miner + volumes: + - ~/.bittensor/wallets:/root/.bittensor/wallets:ro + environment: + - NETUID=${NETUID} + - SUBTENSOR_NETWORK=${SUBTENSOR_NETWORK:-finney} + - OPENAI_API_KEY=${OPENAI_API_KEY} + ports: + - "9091:9091" + - "8091:8091" + restart: unless-stopped + + gateway: + build: + context: . + dockerfile: docker/Dockerfile.gateway + volumes: + - gateway-data:/app/data + ports: + - "8000:8000" + restart: unless-stopped + + sandbox: + build: + context: . 
+ dockerfile: docker/Dockerfile.sandbox + # Not a service — built as an image for validator to spawn containers from + + prometheus: + image: prom/prometheus:latest + volumes: + - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + ports: + - "9090:9090" + + grafana: + image: grafana/grafana:latest + volumes: + - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards + - grafana-data:/var/lib/grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin} + +volumes: + validator-state: + gateway-data: + prometheus-data: + grafana-data: +``` + +### 11.4 — scripts/run_localnet.sh + +```bash +#!/bin/bash +# Start a local subtensor for development & testing +# Requires: subtensor repo cloned and built + +set -e + +echo "Starting local subtensor..." +cd ~/subtensor +./scripts/localnet.sh & + +sleep 10 + +echo "Creating wallets..." +btcli wallet new_coldkey --wallet.name owner --no_password +btcli wallet new_coldkey --wallet.name validator --no_password +btcli wallet new_hotkey --wallet.name validator --wallet.hotkey default +btcli wallet new_coldkey --wallet.name miner --no_password +btcli wallet new_hotkey --wallet.name miner --wallet.hotkey default + +echo "Funding wallets..." +btcli wallet faucet --wallet.name owner --subtensor.chain_endpoint ws://127.0.0.1:9946 +btcli wallet faucet --wallet.name validator --subtensor.chain_endpoint ws://127.0.0.1:9946 +btcli wallet faucet --wallet.name miner --subtensor.chain_endpoint ws://127.0.0.1:9946 + +echo "Creating subnet..." +btcli subnets create --wallet.name owner --subtensor.chain_endpoint ws://127.0.0.1:9946 + +echo "Registering neurons..." +btcli subnets register --wallet.name miner --wallet.hotkey default --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 +btcli subnets register --wallet.name validator --wallet.hotkey default --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 + +echo "Staking to validator..." 
+btcli stake add --wallet.name validator --wallet.hotkey default --amount 100 --subtensor.chain_endpoint ws://127.0.0.1:9946 + +echo "Registering validator on root subnet..." +btcli root register --wallet.name validator --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 +btcli root boost --netuid 1 --increase 1 --wallet.name validator --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 + +echo "✅ Localnet ready. NETUID=1" +echo "Run miner: python neurons/miner.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name miner --wallet.hotkey default" +echo "Run validator: python neurons/validator.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name validator --wallet.hotkey default" +``` + +--- + +## Phase 12: CI/CD & Testing Infrastructure + +### 12.1 — .github/workflows/test.yml + +```yaml +name: Tests +on: [push, pull_request] + +jobs: + unit-tests: + runs-on: ubuntu-latest + strategy: + matrix: + python: ["3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - run: pip install -e ".[dev]" + - run: pytest tests/ -v --tb=short -x + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: "3.12" } + - run: pip install ruff mypy + - run: ruff check reasonforge/ neurons/ + - run: mypy reasonforge/ --ignore-missing-imports + + integration: + runs-on: ubuntu-latest + needs: unit-tests + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: "3.12" } + - run: pip install -e ".[all]" + - run: pytest tests/test_integration_local.py -v --timeout=120 +``` + +### 12.2 — Test Categories + +| Test File | What It Tests | Depends On | +|-----------|---------------|------------| +| `test_engine.py` | All 13 formulas (MVP) | Nothing | +| `test_types.py` | Constants, dataclasses (MVP) | Nothing | +| 
`test_simulator.py` | Epoch simulation (MVP) | engine, types | +| `test_protocol.py` | Synapse serialization, hash verification | protocol.py | +| `test_scoring.py` | Dimension scoring, quality/accuracy/novelty/efficiency | scorer, engine | +| `test_verification.py` | Lean4 checker, code sandbox, math checker | verification/ | +| `test_embeddings.py` | Cosine similarity, plagiarism detection | embeddings/ | +| `test_state.py` | SQLite CRUD, checkpoint save/load, migration | state/ | +| `test_gateway.py` | API endpoints, auth, rate limiting | gateway/ | +| `test_security.py` | Input sanitization, rate guard, anomaly detection | security/ | +| `test_integration_local.py` | Full miner↔validator exchange on localnet | Everything | + +### 12.3 — test_integration_local.py (Key Test) + +```python +class TestLocalIntegration: + """ + End-to-end test: validator sends task → miner solves → validator scores → weights set. + Runs without a real blockchain — mocks subtensor. + """ + + def test_full_epoch_cycle(self): + """One complete epoch with mocked blockchain.""" + # 1. Create mock subtensor + metagraph + # 2. Initialize validator and miner + # 3. Validator generates tasks + # 4. Validator queries miner via localhost axon + # 5. Miner responds with reasoning chain + # 6. Validator scores responses + # 7. Validator computes weights + # 8. Verify: weights are normalized, best miner gets highest weight + # 9. Verify: emission conservation + # 10. Verify: state persisted to SQLite + + def test_trap_detection_live(self): + """Trap problem correctly penalizes a deliberately bad miner.""" + ... + + def test_plagiarism_detection_live(self): + """Two miners submitting identical answers get flagged.""" + ... + + def test_crash_recovery(self): + """Kill validator mid-epoch, restart, verify state restored.""" + ... 
+``` + +--- + +## Phase 13: Documentation & SDK + +### Required Documents + +| Document | Audience | Content | +|----------|----------|---------| +| `README.md` | Everyone | Project overview, quick start, architecture summary | +| `docs/ARCHITECTURE.md` | Developers | System design, data flow, component interactions | +| `docs/PROTOCOL.md` | Subnet devs | Wire protocol (Synapse types), message flow, serialization | +| `docs/MINER_GUIDE.md` | Miners | How to set up and run a miner, LLM backend selection, GPU requirements, earnings optimization | +| `docs/VALIDATOR_GUIDE.md` | Validators | How to run a validator, stake requirements, Lean4 setup, monitoring | +| `docs/API_REFERENCE.md` | API consumers | Endpoint specs, auth, rate limits, examples | +| `docs/DEPLOYMENT.md` | Operators | Docker deployment, environment vars, scaling, backup | +| `docs/SECURITY.md` | Security reviewers | Threat model, anti-adversarial mechanisms, input validation | +| `docs/BENCHMARKS.md` | Contributors | How to add benchmark tasks, format spec, review process | + +### MINER_GUIDE.md — Key Sections + +```markdown +## Requirements +- Python 3.10+ +- GPU recommended (for local LLM backend) +- Bittensor wallet with registered hotkey +- Registration cost: ~0.1 TAO + +## Quick Start +pip install -e ".[miner]" +python neurons/miner.py \ + --netuid $NETUID \ + --subtensor.network finney \ + --wallet.name my_miner \ + --wallet.hotkey default \ + --miner.backend openai \ + --miner.model gpt-4o + +## Backend Options +| Backend | Flag | Requirements | Performance | +|---------|------|-------------|-------------| +| OpenAI | `--miner.backend openai` | API key | High (o1-level) | +| Anthropic | `--miner.backend anthropic` | API key | High | +| Local | `--miner.backend local` | GPU + model | Variable | +| Agent | `--miner.backend agent` | LangGraph + LLM | Highest potential | + +## Earning Optimization +1. Specialize in high-difficulty tasks (2× multiplier at difficulty 10) +2. 
Include formal proofs when possible (quality bonus) +3. Maintain consistency for PEB streak bonuses +4. Avoid plagiarism (0.5× penalty if detected) +``` + +--- + +## 16. Subnet Hyperparameters + +Set via `btcli subnets hyperparameters` or in subnet registration: + +```yaml +# min_compute.yml — Minimum compute requirements +min_compute: + miner: + cpu: 4 + ram_gb: 16 + storage_gb: 50 + gpu: optional # Depends on LLM backend + bandwidth_mbps: 100 + validator: + cpu: 8 + ram_gb: 32 + storage_gb: 100 + gpu: recommended # For embedding model + bandwidth_mbps: 200 + +# Subnet hyperparameters +subnet: + tempo: 360 # Blocks per epoch (~72 minutes) + immunity_period: 7200 # New neuron protection (~24 hours) + max_miners: 192 + max_validators: 64 + min_validator_stake: 1000 # TAO + weights_rate_limit: 100 # Blocks between weight updates + weights_version_key: 1 + adjustment_alpha: 0.7 + difficulty: 10000000 # POW registration difficulty + registration_cost: 0.1 # TAO burn registration +``` + +--- + +## 17. Environment Variables + +```bash +# .env.example + +# ── Bittensor ── +NETUID=XX # Your subnet UID (assigned after registration) +SUBTENSOR_NETWORK=finney # finney | test | local +SUBTENSOR_CHAIN_ENDPOINT= # Custom endpoint (optional) + +# ── Wallet ── +WALLET_NAME=my_wallet +WALLET_HOTKEY=default + +# ── Miner ── +MINER_BACKEND=openai # openai | anthropic | local | agent +MINER_MODEL=gpt-4o # Model identifier +OPENAI_API_KEY=sk-... # If using OpenAI backend +ANTHROPIC_API_KEY=sk-ant-... 
# If using Anthropic backend +MINER_PORT=8091 +MINER_MAX_CONCURRENT=4 + +# ── Validator ── +VALIDATOR_PORT=8092 +VALIDATOR_EPOCH_LENGTH=360 +VALIDATOR_TASKS_PER_EPOCH=12 +VALIDATOR_TRAP_RATE=0.15 +VALIDATOR_TIMEOUT=300 +VALIDATOR_SANDBOX_ENABLED=true +VALIDATOR_LEAN4_ENABLED=true +VALIDATOR_EMBEDDING_MODEL=all-MiniLM-L6-v2 + +# ── Gateway ── +GATEWAY_PORT=8000 +GATEWAY_API_KEY_SECRET=your-secret-key + +# ── Monitoring ── +PROMETHEUS_PORT=9090 +GRAFANA_PASSWORD=admin + +# ── State ── +STATE_DB_PATH=state/reasonforge.db +``` + +--- + +## 18. Build Order + +**Prerequisites:** MVP codebase complete and all tests passing. + +``` +PHASE 1 — Protocol Layer (Days 1-2) + Step 1: Write reasonforge/protocol.py (all 3 Synapse classes) + Step 2: Write reasonforge/base/config.py (CLI args) + Step 3: Write reasonforge/base/neuron.py (BaseNeuron) + Step 4: Write tests/test_protocol.py — verify Synapse serialization + Step 5: Run tests: pytest tests/test_protocol.py -v + +PHASE 2 — Miner Neuron (Days 3-5) + Step 6: Write reasonforge/miner/backends/base.py (LLMBackend ABC) + Step 7: Write reasonforge/miner/backends/openai_backend.py + Step 8: Write reasonforge/miner/backends/anthropic_backend.py + Step 9: Write reasonforge/miner/domain_router.py (6 domain prompts) + Step 10: Write reasonforge/miner/reasoning.py (ReasoningEngine) + Step 11: Write neurons/miner.py (full entry point with Axon) + +PHASE 3 — Validator Neuron (Days 6-9) + Step 12: Write reasonforge/validator/task_manager.py + Step 13: Write reasonforge/validator/trap_manager.py + Step 14: Write reasonforge/validator/objective_scorer.py + Step 15: Write reasonforge/validator/consensus.py (wraps MVP engine) + Step 16: Write reasonforge/validator/scoring.py (orchestrator) + Step 17: Write reasonforge/validator/weight_setter.py + Step 18: Write neurons/validator.py (full entry point with epoch loop) + Step 19: Write tests/test_scoring.py — verify scoring pipeline + Step 20: Run: pytest tests/test_scoring.py -v + +PHASE 4 — 
Verification Backends (Days 10-12) + Step 21: Write reasonforge/verification/math_checker.py (SymPy) + Step 22: Write reasonforge/verification/code_sandbox.py (Docker) + Step 23: Write reasonforge/verification/lean4_checker.py + Step 24: Write reasonforge/verification/fact_checker.py + Step 25: Write docker/Dockerfile.sandbox + Step 26: Write tests/test_verification.py + Step 27: Run: pytest tests/test_verification.py -v + +PHASE 5 — Plagiarism Detection (Day 13) + Step 28: Write reasonforge/embeddings/similarity.py + Step 29: Write tests/test_embeddings.py + Step 30: Run: pytest tests/test_embeddings.py -v + +PHASE 6 — Benchmark Database (Days 14-16) + Step 31: Create benchmark JSON files (450+ tasks across 6 domains) + Step 32: Expand reasonforge/task_generator.py (DB loading, balanced sampling) + Step 33: Write scripts/benchmark_import.py + Step 34: Write scripts/generate_traps.py (68 trap problems) + +PHASE 7 — Persistence (Days 17-18) + Step 35: Write reasonforge/state/database.py (SQLite schema + CRUD) + Step 36: Write reasonforge/state/checkpoint.py + Step 37: Write reasonforge/state/migrations.py + Step 38: Write tests/test_state.py + Step 39: Run: pytest tests/test_state.py -v + +PHASE 8 — API Gateway (Days 19-20) + Step 40: Write reasonforge/gateway/schemas.py + Step 41: Write reasonforge/gateway/auth.py + Step 42: Write reasonforge/gateway/rate_limiter.py + Step 43: Write reasonforge/gateway/billing.py + Step 44: Write reasonforge/gateway/app.py + Step 45: Write tests/test_gateway.py + Step 46: Run: pytest tests/test_gateway.py -v + +PHASE 9 — Monitoring (Days 21-22) + Step 47: Write reasonforge/monitoring/metrics.py (Prometheus) + Step 48: Write reasonforge/monitoring/logger.py (structlog) + Step 49: Write reasonforge/monitoring/health.py + Step 50: Create monitoring/prometheus.yml + Step 51: Create monitoring/grafana/dashboards/*.json + +PHASE 10 — Security (Day 23) + Step 52: Write reasonforge/security/sanitizer.py + Step 53: Write 
reasonforge/security/rate_guard.py + Step 54: Write reasonforge/security/anomaly.py + Step 55: Write tests/test_security.py + Step 56: Run: pytest tests/test_security.py -v + +PHASE 11 — Docker & Deployment (Days 24-25) + Step 57: Write docker/Dockerfile.miner + Step 58: Write docker/Dockerfile.validator + Step 59: Write docker/Dockerfile.gateway + Step 60: Write docker/docker-compose.yml + Step 61: Write docker/docker-compose.localnet.yml + Step 62: Write docker/docker-compose.monitoring.yml + Step 63: Write scripts/setup_wallets.sh + Step 64: Write scripts/register_subnet.sh + Step 65: Write scripts/register_neurons.sh + Step 66: Write scripts/run_localnet.sh + +PHASE 12 — CI/CD (Day 26) + Step 67: Write .github/workflows/test.yml + Step 68: Write .github/workflows/lint.yml + Step 69: Write .github/workflows/build-docker.yml + +PHASE 13 — Documentation (Days 27-28) + Step 70: Update README.md (production sections) + Step 71: Write docs/ARCHITECTURE.md + Step 72: Write docs/PROTOCOL.md + Step 73: Write docs/MINER_GUIDE.md + Step 74: Write docs/VALIDATOR_GUIDE.md + Step 75: Write docs/API_REFERENCE.md + Step 76: Write docs/DEPLOYMENT.md + Step 77: Write docs/SECURITY.md + Step 78: Write docs/BENCHMARKS.md + Step 79: Write min_compute.yml + +INTEGRATION & VERIFICATION (Days 29-30) + Step 80: Write tests/test_integration_local.py + Step 81: Run full test suite: pytest tests/ -v + Step 82: Run: docker-compose -f docker/docker-compose.localnet.yml up + Step 83: Verify miner↔validator exchange on localnet + Step 84: Verify weights are set on local chain + Step 85: Verify state persistence across restart + Step 86: Verify metrics appear in Prometheus/Grafana + Step 87: Verify API gateway responds correctly + Step 88: Run security audit: input fuzzing, oversized payloads, invalid hashes + Step 89: Final: ruff check + mypy on entire codebase + Step 90: Tag v0.1.0 release +``` + +--- + +## 19. 
Success Criteria + +### Unit Tests (must all pass) +- [ ] All 13 MVP formula tests still pass +- [ ] Synapse serialization roundtrip works +- [ ] Scoring pipeline produces correct dimension scores +- [ ] Lean4 checker verifies valid proof, rejects invalid +- [ ] Code sandbox executes safely, returns test results +- [ ] Embedding similarity flags plagiarism above the 0.95 cosine-similarity threshold +- [ ] SQLite state saves and loads correctly +- [ ] API gateway auth rejects invalid keys +- [ ] Input sanitizer truncates oversized submissions +- [ ] Anomaly detector flags suspiciously fast responses + +### Integration Tests (must all pass) +- [ ] Miner Axon starts and serves ReasoningTask Synapse +- [ ] Validator queries miner and receives valid response +- [ ] Full epoch cycle completes: tasks → scoring → weights +- [ ] Weights are normalized and submitted to chain +- [ ] Emission conservation holds (within rounding) +- [ ] Trap problems correctly penalize low-quality miners +- [ ] Plagiarism detection works across submissions in same batch +- [ ] State persists across validator restart +- [ ] Multiple miners compete; the best-scoring miner receives the highest weight + +### Localnet Tests +- [ ] `run_localnet.sh` completes without errors +- [ ] Miner and validator register UIDs on local chain +- [ ] Validator sets weights after first epoch +- [ ] Emissions flow to miner after subnet tempo +- [ ] `btcli subnets metagraph --netuid 1` shows correct data + +### Docker Tests +- [ ] All 4 Docker images build successfully +- [ ] `docker-compose up` starts full stack +- [ ] Sandbox container runs isolated code safely +- [ ] Prometheus scrapes metrics from both neurons +- [ ] Grafana dashboards render correctly + +### Security Tests +- [ ] Oversized submissions are truncated rather than causing a crash +- [ ] Invalid submission hashes are detected and penalized +- [ ] Rate limiting prevents DoS from single UID +- [ ] Miner blacklists non-validator callers +- [ ] Code sandbox prevents filesystem/network access + +### Documentation +- [ 
] README has working quick-start commands +- [ ] Miner guide covers all 4 backends +- [ ] Validator guide covers Lean4 + sandbox setup +- [ ] API reference documents all endpoints with examples +- [ ] Deployment guide covers Docker + bare metal + +--- + +## Appendix A: Dependency Versions + +``` +# Core +bittensor>=10.0.1 +torch>=2.0.0 +numpy>=1.24.0 +pydantic>=2.0.0 +structlog>=23.0.0 + +# Miner +openai>=1.0.0 +anthropic>=0.20.0 +transformers>=4.35.0 # For local backend +vllm>=0.3.0 # Optional: fast local inference +langchain>=0.1.0 # For agent backend +langgraph>=0.0.10 # For agent backend + +# Validator +sentence-transformers>=2.2.0 +sympy>=1.12 +docker>=7.0.0 # For code sandbox +prometheus-client>=0.19.0 + +# Gateway +fastapi>=0.100.0 +uvicorn>=0.23.0 +python-jose>=3.3.0 # JWT +passlib>=1.7.4 # Password hashing + +# Dev +pytest>=7.0.0 +pytest-asyncio>=0.21.0 +pytest-timeout>=2.1.0 +ruff>=0.1.0 +mypy>=1.6.0 +``` + +--- + +## Appendix B: Key Differences from MVP + +| Aspect | MVP | Production | +|--------|-----|------------| +| Miners | Statistical profiles (probabilistic) | Real LLMs via API/local inference | +| Validators | Simulated noise/bias profiles | Real scoring pipeline with verification | +| Network | None — in-memory simulation | Bittensor Axon/Dendrite over TCP | +| Scoring | All in ScoringEngine (kept!) | ScoringEngine + verification backends | +| Weights | Simulated emission distribution | On-chain via `subtensor.set_weights()` | +| State | In-memory, lost on exit | SQLite + checkpoint recovery | +| Plagiarism | Jaccard similarity | Sentence-transformer cosine similarity | +| Security | None needed | Full input sanitization, rate limiting | +| Monitoring | CLI print statements | Prometheus + Grafana + structured logging | +| Deployment | `python -m reasonforge.run` | Docker Compose + systemd | +| Testing | Unit tests only | Unit + integration + localnet + security | + +--- + +*End of production build plan. 
This document assumes the MVP from PLAN.md is complete and transforms it into a deployable Bittensor subnet.* diff --git a/PLAN_PROOF_LAYER.md b/PLAN_PROOF_LAYER.md new file mode 100644 index 0000000..df69878 --- /dev/null +++ b/PLAN_PROOF_LAYER.md @@ -0,0 +1,2347 @@ +# ReasonForge — The Proof Layer for AI + +## PLAN_PROOF_LAYER.md + +> **What this is**: The plan that transforms ReasonForge from a Bittensor subnet into +> a new category of infrastructure — machine-verifiable trust for AI reasoning. +> +> **Prerequisite**: MVP (PLAN.md) complete. Production subnet (PLAN_PRODUCTION.md) in progress or complete. +> +> **One-line thesis**: Any AI model's reasoning chain in → cryptographic proof of correctness out. +> HTTPS did this for web traffic. We do this for AI thought. +> +> **Build time**: 12-16 weeks for core. 6 months to production-grade. + +--- + +## Table of Contents + +``` +PART I — THE PIVOT + 1. Why We're Narrowing + 2. What We Kill + 3. What We Keep + 4. The New Architecture + +PART II — FORMAL VERIFICATION ENGINE + 5. NL-to-Formal Translation Pipeline + 6. Lean 4 Verification Backend + 7. Code Verification Backend + 8. First-Order Logic Backend + 9. Step-Level Process Supervision + 10. Verification Verdicts & Failure Localization + +PART III — ZK VERIFICATION CERTIFICATES + 11. Certificate Schema + 12. ZK Circuit Design + 13. Recursive Proof Composition + 14. On-Chain Certificate Registry + 15. Certificate Verification Contract + +PART IV — BITTENSOR INTEGRATION (REVISED) + 16. New Synapse Protocol + 17. Miner Role: Translator + 18. Validator Role: Verifier + 19. Revised Incentive Mechanism + 20. Weight Computation + +PART V — ENTERPRISE API PRODUCT + 21. Verification-as-a-Service API + 22. SDK (Python / TypeScript / Rust) + 23. Model Provider Integrations + 24. Compliance Report Generator + +PART VI — BUILD ORDER & MILESTONES + 25. Directory Structure + 26. Phase-by-Phase Build Order + 27. Success Criteria + 28. 
Dependency Map +``` + +--- + +# PART I — THE PIVOT + +--- + +## 1. Why We're Narrowing + +The original ReasonForge design scores AI reasoning across 6 domains using heuristics. +The problem: heuristic scoring is gameable, unverifiable, and no enterprise will pay for it. + +The insight: **only 3 types of reasoning can be mechanically proven correct**: + +| Domain | Verification Method | Decidable? | Existing Tools | +|--------|---------------------|------------|----------------| +| Mathematics | Formal proof checkers (Lean 4, Coq, Isabelle) | **Yes** | Mature | +| Code | Execution + property-based testing + static analysis | **Yes** | Mature | +| Formal Logic | SAT/SMT solvers, model checkers | **Yes** | Mature | +| Scientific | ??? | No — requires domain expertise | None | +| Strategic | ??? | No — requires simulation | None | +| Ethical | ??? | No — inherently subjective | None | + +We don't score reasoning. We **prove** it. If we can't prove it, we don't touch it. +This constraint is our moat. Everyone else is building better scorers. We build provers. + +--- + +## 2. What We Kill + +Remove entirely from the codebase: + +``` +- Domain: SCIENTIFIC → Cut. No mechanical verification possible. +- Domain: STRATEGIC → Cut. Game-theoretic verification is research-stage. +- Domain: CAUSAL → Cut. Causal inference verification requires SCMs we can't auto-generate. +- Domain: ETHICAL → Cut. Inherently subjective. No proof exists. +- Heuristic novelty scoring → Cut. Replace with proof/no-proof binary. +- Heuristic quality scoring → Cut. Replace with formal verification verdict. +- Consensus-based scoring (Eq. 12) → Restructure. Consensus on PROOF VALIDITY, not subjective quality. +- CMS dimensions (Q, A, N, E) → Replace with new scoring dimensions (see Section 19). +``` + +--- + +## 3. What We Keep + +From the MVP: + +``` +✓ ScoringEngine framework → Refactor with new formulas, keep architecture +✓ Emission distribution (Eq. 5) → Keep as-is +✓ PEB mechanism (Eq. 
4) → Keep as-is +✓ Trap problems (Eq. 9) → Keep, now with formally verified ground truth +✓ Slashing (Eq. 10) → Keep as-is +✓ Simulator → Refactor for new scoring +✓ CLI runner → Keep +✓ Test infrastructure → Extend +``` + +From the production plan: + +``` +✓ Bittensor protocol layer → Rewrite Synapses for new roles +✓ Base neuron class → Keep +✓ State persistence → Keep +✓ Docker deployment → Keep +✓ Monitoring → Keep +✓ CI/CD → Keep +``` + +--- + +## 4. The New Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ EXTERNAL WORLD │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌───────────────┐ │ +│ │ OpenAI │ │ Anthropic│ │ DeepSeek │ │ Any LLM/Agent │ │ +│ │ o1 / GPT │ │ Claude │ │ R1 │ │ Framework │ │ +│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └──────┬────────┘ │ +│ │ │ │ │ │ +│ └──────────────┴──────────────┴───────────────┘ │ +│ │ │ +│ "Here is my reasoning chain" │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ REASONFORGE VERIFICATION API │ │ +│ │ │ │ +│ │ POST /v1/verify │ │ +│ │ { │ │ +│ │ "reasoning_chain": [...steps...], │ │ +│ │ "domain": "mathematics" | "code" | "logic", │ │ +│ │ "original_query": "...", │ │ +│ │ "claimed_answer": "...", │ │ +│ │ "proof_level": "formal" | "standard" | "quick" │ │ +│ │ } │ │ +│ └──────────────────────┬───────────────────────────────────┘ │ +│ │ │ +└─────────────────────────┼───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ BITTENSOR SUBNET │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ VALIDATORS │ │ +│ │ │ │ +│ │ 1. Receive verification request │ │ +│ │ 2. Dispatch to N miners (translators) │ │ +│ │ 3. Collect formal translations │ │ +│ │ 4. Run mechanical verification (Lean4/Sandbox/SMT) │ │ +│ │ 5. Generate verification verdict │ │ +│ │ 6. Produce ZK certificate │ │ +│ │ 7. 
Set on-chain weights │ │ +│ │ │ │ +│ └───────────────┬──────────────────────┬───────────────────┘ │ +│ │ │ │ +│ ┌───────▼───────┐ ┌───────▼───────┐ │ +│ │ MINERS │ │ MINERS │ │ +│ │ (Translators) │ │ (Translators) │ × N │ +│ │ │ │ │ │ +│ │ NL reasoning │ │ NL reasoning │ │ +│ │ ↓ │ │ ↓ │ │ +│ │ Lean 4 proof │ │ Lean 4 proof │ │ +│ │ — OR — │ │ — OR — │ │ +│ │ Test suite │ │ Test suite │ │ +│ │ — OR — │ │ — OR — │ │ +│ │ FOL formula │ │ FOL formula │ │ +│ └───────────────┘ └───────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ ZK CERTIFICATE LAYER │ │ +│ │ │ │ +│ │ Inputs: │ │ +│ │ - Verification verdict (pass/fail per step) │ │ +│ │ - Validator signatures (N of M) │ │ +│ │ - Task metadata hash │ │ +│ │ │ │ +│ │ Output: │ │ +│ │ - ZK-SNARK proof that: │ │ +│ │ ✓ Reasoning was formally verified │ │ +│ │ ✓ N independent validators confirmed │ │ +│ │ ✓ Each step has mechanical proof │ │ +│ │ — WITHOUT revealing the reasoning chain │ │ +│ │ │ │ +│ │ On-chain: │ │ +│ │ - Certificate registry (EVM contract) │ │ +│ │ - Verify in O(1): verifyProof(certificate) → bool │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ RETURNED TO CALLER │ +│ │ +│ { │ +│ "certificate_id": "0xabc...def", │ +│ "verdict": "VERIFIED" | "PARTIAL" | "FAILED", │ +│ "steps_verified": 7, │ +│ "steps_total": 7, │ +│ "failure_points": [], // empty if all pass │ +│ "proof": "0x...", // ZK-SNARK proof bytes │ +│ "registry_tx": "0x...", // On-chain registration │ +│ "verification_time_ms": 4200, │ +│ "validators_participated": 5, │ +│ "confidence": 1.0, // Binary: proved or not │ +│ "lean4_proofs": [...], // Optional: raw proofs │ +│ "verify_url": "https://verify.reasonforge.ai/0xabc...def" │ +│ } │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +**The 
fundamental role change:** + +| | Old Design | New Design | +|--|------------|------------| +| **Miners do** | Solve reasoning problems | Translate NL reasoning → formal proofs | +| **Validators do** | Score reasoning quality (heuristic) | Run mechanical verification (deterministic) | +| **Output is** | Subjective score (0.0 - 1.0) | Binary proof certificate (VERIFIED / FAILED) | +| **Customer gets** | "This reasoning scored 0.87" | "This reasoning is mathematically proven correct" | + +--- + +# PART II — FORMAL VERIFICATION ENGINE + +--- + +## 5. NL-to-Formal Translation Pipeline + +This is the core innovation. Miners receive a natural language reasoning chain and +translate each step into a formally verifiable representation. + +### 5.1 — Translation Interface + +```python +class TranslationRequest: + """What the miner receives.""" + task_id: str + original_query: str # The question that was asked + reasoning_chain: list[dict] # [{step_id, content, claimed_conclusion}] + domain: str # "mathematics" | "code" | "logic" + difficulty: int # 1-10 + proof_level: str # "formal" | "standard" | "quick" + +class TranslationResult: + """What the miner returns.""" + task_id: str + translations: list[StepTranslation] + compilation_status: str # "COMPILED" | "PARTIAL" | "FAILED" + full_proof: Optional[str] # Complete Lean4/code/FOL artifact + +class StepTranslation: + """One reasoning step translated to formal representation.""" + step_id: int + original_content: str # NL text (echoed back for alignment) + formal_representation: str # Lean4 / Python+tests / FOL formula + dependencies: list[int] # Which previous steps this depends on + translation_confidence: float # Miner's self-reported confidence + compilation_check: bool # Did this step compile in isolation? 
+ notes: Optional[str] # Miner's notes on translation choices +``` + +### 5.2 — Domain-Specific Translation Strategies + +**Mathematics → Lean 4:** + +``` +NL: "Since n is even, we can write n = 2k for some integer k" + ↓ (Miner translates) +Lean 4: + theorem step_3 (n : ℤ) (h : Even n) : ∃ k : ℤ, n = 2 * k := by + exact h + +NL: "Substituting n = 2k into n² gives (2k)² = 4k²" + ↓ +Lean 4: + theorem step_4 (k : ℤ) : (2 * k) ^ 2 = 4 * k ^ 2 := by + ring +``` + +**Code → Executable Tests + Property Checks:** + +``` +NL: "We use a hash map to count frequencies, achieving O(n) time" + ↓ (Miner translates) +Python: + def count_frequencies(arr: list[int]) -> dict[int, int]: + freq = {} + for x in arr: + freq[x] = freq.get(x, 0) + 1 + return freq + + # Correctness tests + def test_basic(): + assert count_frequencies([1,2,2,3]) == {1:1, 2:2, 3:1} + + def test_empty(): + assert count_frequencies([]) == {} + + # Property-based test + from hypothesis import given, strategies as st + + @given(st.lists(st.integers(min_value=-1000, max_value=1000))) + def test_sum_preserved(arr): + freq = count_frequencies(arr) + assert sum(freq.values()) == len(arr) + + # Complexity verification (statistical) + def test_linear_time(): + import time + elapsed = {} + for n in [1000, 10000, 100000]: + arr = list(range(n)) + start = time.perf_counter() + count_frequencies(arr) + elapsed[n] = time.perf_counter() - start + # Should scale linearly: 10x the input should take roughly 10x the time + ratio = elapsed[100000] / elapsed[10000] + assert ratio < 15 # Linear would be ~10, allow margin +``` + +**Formal Logic → FOL + SMT-LIB:** + +``` +NL: "All mammals are warm-blooded. Whales are mammals. Therefore whales are warm-blooded." 
+ ↓ (Miner translates) +SMT-LIB: + (declare-sort Animal) + (declare-fun Mammal (Animal) Bool) + (declare-fun WarmBlooded (Animal) Bool) + (declare-fun IsWhale (Animal) Bool) + + ; Premise 1: All mammals are warm-blooded + (assert (forall ((x Animal)) (=> (Mammal x) (WarmBlooded x)))) + + ; Premise 2: Whales are mammals + (assert (forall ((x Animal)) (=> (IsWhale x) (Mammal x)))) + + ; Negation of conclusion (to prove by contradiction) + (declare-const w Animal) + (assert (IsWhale w)) + (assert (not (WarmBlooded w))) + + (check-sat) ; Expected: UNSAT (meaning conclusion is valid) +``` + +### 5.3 — Translation Quality Tiers + +```python +class ProofLevel(str, Enum): + FORMAL = "formal" # Full Lean4/Coq proof. Strongest guarantee. Slowest. + STANDARD = "standard" # Executable tests + property checks. Good balance. + QUICK = "quick" # Type checking + basic assertions. Fast, weaker guarantee. +``` + +| Level | Math | Code | Logic | +|-------|------|------|-------| +| **Formal** | Full Lean 4 proof, every step | Property-based tests + formal spec (TLA+/Dafny) | Complete FOL proof in Lean 4 | +| **Standard** | Lean 4 key lemmas + SymPy numerical | Unit tests + hypothesis + complexity | SMT-LIB + Z3 satisfiability | +| **Quick** | SymPy symbolic verification | Type checking + basic assertions | Propositional logic SAT check | + +Pricing scales with proof level. Formal costs 10× quick. This is how we monetize. + +--- + +## 6. Lean 4 Verification Backend + +### 6.1 — Lean 4 Project Template + +Every math verification task gets a fresh Lean 4 project: + +``` +lean_workspace/ +├── lakefile.lean # Lake build config +├── ReasonForge.lean # Main entry +├── ReasonForge/ +│ ├── Context.lean # Problem statement + given assumptions +│ ├── Step1.lean # Translation of reasoning step 1 +│ ├── Step2.lean # Translation of reasoning step 2 +│ ├── ... 
+│ ├── StepN.lean # Translation of reasoning step N +│ └── Chain.lean # Full chain: imports all steps, proves final theorem +└── lean-toolchain # Lean version pinning +``` + +### 6.2 — Chain.lean (The Key File) + +```lean +-- Chain.lean: Proves the entire reasoning chain is valid +-- Auto-generated by validator from miner translations + +import ReasonForge.Context +import ReasonForge.Step1 +import ReasonForge.Step2 +import ReasonForge.Step3 + +-- The final theorem that connects all steps +-- If this compiles, the entire reasoning chain is formally verified. +theorem reasoning_chain_valid + (assumptions : ProblemContext) + (s1 : Step1Result assumptions) + (s2 : Step2Result s1) + (s3 : Step3Result s2) + : FinalConclusion s3 := by + exact final_proof s1 s2 s3 +``` + +### 6.3 — Lean4Verifier Class + +```python +class Lean4Verifier: + """ + Production Lean 4 verification backend. + Compiles miner translations, extracts per-step verdicts. + """ + + def __init__(self, lean_toolchain: str = "leanprover/lean4:v4.8.0"): + self.toolchain = lean_toolchain + self.workspace_dir = Path("lean_workspaces") + self.workspace_dir.mkdir(exist_ok=True) + self.timeout = 120 # seconds per verification + + async def verify_chain( + self, + task_id: str, + translations: list[StepTranslation], + context: str, + ) -> VerificationVerdict: + """ + Full verification pipeline: + 1. Create Lean 4 project from translations + 2. Compile with `lake build` + 3. Parse output for per-step success/failure + 4. Return structured verdict + """ + workspace = self.workspace_dir / task_id + workspace.mkdir(exist_ok=True) + + try: + # 1. Generate project files + self._generate_lakefile(workspace) + self._generate_context(workspace, context) + for trans in translations: + self._generate_step(workspace, trans) + self._generate_chain(workspace, translations) + + # 2. Compile + result = await self._run_lake_build(workspace) + + # 3. 
Parse results + step_verdicts = self._parse_compilation_output(result, translations) + + # 4. Determine overall verdict + all_passed = all(sv.verified for sv in step_verdicts) + partial = any(sv.verified for sv in step_verdicts) + + return VerificationVerdict( + task_id=task_id, + overall="VERIFIED" if all_passed else ("PARTIAL" if partial else "FAILED"), + step_verdicts=step_verdicts, + total_steps=len(translations), + verified_steps=sum(1 for sv in step_verdicts if sv.verified), + failure_points=[sv for sv in step_verdicts if not sv.verified], + raw_output=result.stdout, + compilation_time_ms=result.elapsed_ms, + ) + + finally: + # Cleanup workspace (or archive for audit) + shutil.rmtree(workspace, ignore_errors=True) + + async def _run_lake_build(self, workspace: Path) -> CompilationResult: + """Run `lake build` with timeout and resource limits.""" + start = time.monotonic() + process = await asyncio.create_subprocess_exec( + "lake", "build", + cwd=workspace, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + try: + stdout, stderr = await asyncio.wait_for( + process.communicate(), timeout=self.timeout + ) + elapsed = int((time.monotonic() - start) * 1000) + return CompilationResult( + success=process.returncode == 0, + stdout=stdout.decode(), + stderr=stderr.decode(), + elapsed_ms=elapsed, + ) + except asyncio.TimeoutError: + process.kill() + return CompilationResult( + success=False, + stdout="", + stderr="TIMEOUT", + elapsed_ms=self.timeout * 1000, + ) + + def _parse_compilation_output( + self, result: CompilationResult, translations: list[StepTranslation] + ) -> list[StepVerdict]: + """ + Parse Lean 4 compiler output to determine which steps succeeded. + Lean reports errors with file:line:col format → map back to steps. 
+ """ + verdicts = [] + for trans in translations: + step_file = f"Step{trans.step_id}.lean" + # Check if this file had any errors in compiler output + has_error = step_file in result.stderr and "error" in result.stderr + verdicts.append(StepVerdict( + step_id=trans.step_id, + verified=not has_error and result.success, + error_message=self._extract_error(result.stderr, step_file) if has_error else None, + formal_representation=trans.formal_representation, + )) + return verdicts +``` + +--- + +## 7. Code Verification Backend + +### 7.1 — CodeVerifier Class + +```python +class CodeVerifier: + """ + Verify code reasoning through execution, testing, and static analysis. + All execution happens in Docker sandbox — zero trust on miner code. + """ + + def __init__(self, sandbox_image: str = "reasonforge-sandbox:latest"): + self.sandbox = CodeSandbox(image=sandbox_image) + self.timeout = 60 + + async def verify_chain( + self, + task_id: str, + translations: list[StepTranslation], + original_code_claim: str, + ) -> VerificationVerdict: + """ + For each step: + 1. Extract code + tests from translation + 2. Run in sandbox + 3. Check: tests pass? Properties hold? Types check? 
+ """ + step_verdicts = [] + + for trans in translations: + # Parse the translation into code and test components + components = self._parse_code_translation(trans) + + # Execute in sandbox + exec_result = await self.sandbox.execute( + code=components.implementation, + tests=components.tests, + property_tests=components.property_tests, + timeout=self.timeout, + ) + + # Determine verdict + tests_passed = exec_result.tests_passed == exec_result.tests_total + properties_hold = exec_result.property_violations == 0 + + step_verdicts.append(StepVerdict( + step_id=trans.step_id, + verified=tests_passed and properties_hold, + error_message=exec_result.error if not tests_passed else None, + details={ + "tests_passed": exec_result.tests_passed, + "tests_total": exec_result.tests_total, + "property_violations": exec_result.property_violations, + "coverage_percent": exec_result.coverage, + "execution_time_ms": exec_result.elapsed_ms, + }, + )) + + all_passed = all(sv.verified for sv in step_verdicts) + return VerificationVerdict( + task_id=task_id, + overall="VERIFIED" if all_passed else "PARTIAL" if any(sv.verified for sv in step_verdicts) else "FAILED", + step_verdicts=step_verdicts, + total_steps=len(translations), + verified_steps=sum(1 for sv in step_verdicts if sv.verified), + failure_points=[sv for sv in step_verdicts if not sv.verified], + ) +``` + +### 7.2 — Sandbox Execution Protocol + +```python +class SandboxExecution: + """ + Docker sandbox for untrusted code execution. 
+ + Security model: + - No network access + - No filesystem beyond /tmp (tmpfs, 64MB) + - CPU limited to 50% of one core + - Memory limited to 512MB + - Process count limited to 50 + - No capabilities + - Read-only rootfs + - Non-root user + - Timeout enforced externally + """ + + SANDBOX_CONFIG = { + "network_disabled": True, + "read_only": True, + "mem_limit": "512m", + "memswap_limit": "512m", # No swap + "cpu_period": 100000, + "cpu_quota": 50000, # 50% of one core + "pids_limit": 50, + "tmpfs": {"/tmp": "size=64m"}, + "cap_drop": ["ALL"], + "security_opt": ["no-new-privileges"], + "user": "sandbox", + } +``` + +--- + +## 8. First-Order Logic Backend + +### 8.1 — FOLVerifier Class + +```python +class FOLVerifier: + """ + Verify logical reasoning using SMT solvers (Z3, CVC5). + Strategy: translate logical argument to SMT-LIB, prove by refutation. + """ + + def __init__(self): + self.solver_timeout = 30 # seconds + + async def verify_chain( + self, + task_id: str, + translations: list[StepTranslation], + ) -> VerificationVerdict: + """ + For each step: + 1. Parse SMT-LIB from translation + 2. Assert premises + negation of conclusion + 3. If UNSAT → conclusion follows from premises (valid step) + 4. If SAT → conclusion does NOT follow (invalid step) + 5. 
If UNKNOWN → inconclusive + """ + step_verdicts = [] + + for trans in translations: + smt_code = trans.formal_representation + + result = await self._run_z3(smt_code) + + if result.status == "unsat": + # UNSAT means negated conclusion is impossible → step is valid + verified = True + error = None + elif result.status == "sat": + # SAT means there exists a counterexample → step is invalid + verified = False + error = f"Counterexample found: {result.model}" + else: + # UNKNOWN — solver couldn't decide + verified = False + error = f"Solver returned UNKNOWN after {self.solver_timeout}s" + + step_verdicts.append(StepVerdict( + step_id=trans.step_id, + verified=verified, + error_message=error, + details={"smt_status": result.status, "solver_time_ms": result.elapsed_ms}, + )) + + all_passed = all(sv.verified for sv in step_verdicts) + return VerificationVerdict( + task_id=task_id, + overall="VERIFIED" if all_passed else "PARTIAL" if any(sv.verified for sv in step_verdicts) else "FAILED", + step_verdicts=step_verdicts, + total_steps=len(translations), + verified_steps=sum(1 for sv in step_verdicts if sv.verified), + failure_points=[sv for sv in step_verdicts if not sv.verified], + ) + + async def _run_z3(self, smt_code: str) -> SMTResult: + """Execute Z3 on SMT-LIB input.""" + with tempfile.NamedTemporaryFile(suffix=".smt2", mode="w", delete=False) as f: + f.write(smt_code) + f.flush() + process = await asyncio.create_subprocess_exec( + "z3", f"-T:{self.solver_timeout}", f.name, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await process.communicate() + os.unlink(f.name) + + output = stdout.decode().strip() + if output.startswith("unsat"): + return SMTResult(status="unsat", model=None) + elif output.startswith("sat"): + return SMTResult(status="sat", model=output) + else: + return SMTResult(status="unknown", model=None) +``` + +--- + +## 9. 
Step-Level Process Supervision + +### 9.1 — The Process Supervision Model + +This is the key differentiator from outcome-based verification. + +``` +Traditional (Outcome-based): + "Is the final answer correct?" → Yes/No + Problem: A correct answer can come from wrong reasoning (lucky guess) + Problem: Can't identify WHERE reasoning breaks down + +ReasonForge (Process-based): + "Is each individual reasoning step provably correct?" → [Yes, Yes, No, Yes, ...] + Benefit: Identifies exact failure point + Benefit: Correct process guarantees correct outcome + Benefit: Partial verification is still valuable +``` + +### 9.2 — Dependency Graph + +Reasoning steps aren't linear — step 5 might depend on steps 2 and 3 but not 4. +Miners declare dependencies. Validators verify the dependency graph is valid. + +```python +@dataclass +class StepDependencyGraph: + """DAG of reasoning step dependencies.""" + + steps: dict[int, StepTranslation] # step_id → translation + edges: dict[int, list[int]] # step_id → [dependency step_ids] + + def validate_dag(self) -> bool: + """Check: no cycles, all dependencies exist, topological order valid.""" + ... + + def get_verification_order(self) -> list[int]: + """Return topological sort — verify leaves first, then dependents.""" + ... + + def invalidation_cascade(self, failed_step: int) -> set[int]: + """If step N fails, which downstream steps are also invalid?""" + ... 
+``` + +### 9.3 — Failure Localization + +```python +@dataclass +class FailureReport: + """Detailed report when verification fails.""" + + failed_step_id: int + original_reasoning: str # What the AI said + formal_translation: str # What the miner translated it to + verification_error: str # Why it failed (compiler/solver output) + suggested_fix: Optional[str] # If the error is common, suggest a fix + cascade_impact: list[int] # Which downstream steps are invalidated + last_valid_step: int # The deepest step that still holds + partial_correctness: float # Fraction of chain that is verified (0.0 - 1.0) +``` + +This is what enterprises pay for. Not "your reasoning scored 0.87" but +"Steps 1-4 are mathematically proven correct. Step 5 contains an error: +the substitution of x=2k assumes k is positive, but k could be negative. +Steps 6-8 depend on Step 5 and are therefore unverified." + +--- + +## 10. Verification Verdicts & Failure Localization + +### 10.1 — Verdict Schema + +```python +@dataclass +class VerificationVerdict: + """The core output of the verification pipeline.""" + + task_id: str + overall: str # "VERIFIED" | "PARTIAL" | "FAILED" + + # Per-step results + step_verdicts: list[StepVerdict] + total_steps: int + verified_steps: int + + # Failure analysis + failure_points: list[StepVerdict] # Steps that failed + failure_report: Optional[FailureReport] = None + + # Metadata (defaulted: dataclass fields without defaults cannot follow defaulted ones) + domain: str = "" # "mathematics" | "code" | "logic" + proof_level: str = "" # "formal" | "standard" | "quick" + verification_time_ms: int = 0 + + # Raw artifacts (for audit) + raw_output: Optional[str] = None # Lean4 compiler output, test logs, etc. 
+ + # Translators involved (miner UIDs) + translator_uids: list[int] = field(default_factory=list) + + # For certificate generation + verdict_hash: str = "" # SHA-256 of canonical verdict representation + + def compute_verdict_hash(self) -> str: + """Deterministic hash of the verdict for ZK proof input.""" + canonical = json.dumps({ + "task_id": self.task_id, + "overall": self.overall, + "steps": [ + {"id": sv.step_id, "verified": sv.verified} + for sv in self.step_verdicts + ], + }, sort_keys=True) + return hashlib.sha256(canonical.encode()).hexdigest() + +@dataclass +class StepVerdict: + """Verification result for a single reasoning step.""" + + step_id: int + verified: bool # Binary: proved or not + error_message: Optional[str] # If failed, why + formal_representation: str # The formal translation that was checked + details: dict = field(default_factory=dict) # Backend-specific details +``` + +--- + +# PART III — ZK VERIFICATION CERTIFICATES + +--- + +## 11. Certificate Schema + +### 11.1 — What Goes Into a Certificate + +```python +@dataclass +class VerificationCertificate: + """ + A ZK-SNARK proof that reasoning was formally verified, + without revealing the reasoning itself. 
+ """ + + # Identity + certificate_id: str # Unique ID (hash of contents) + version: int = 1 # Schema version + + # What was verified (public inputs to ZK circuit) + task_hash: str # SHA-256 of original query + reasoning chain + domain: str # "mathematics" | "code" | "logic" + proof_level: str # "formal" | "standard" | "quick" + total_steps: int # Number of reasoning steps + verified_steps: int # Number that passed verification + overall_verdict: str # "VERIFIED" | "PARTIAL" | "FAILED" + timestamp: int # Unix timestamp + + # Who verified (public inputs) + validator_count: int # Number of validators that participated + validator_threshold: int # Minimum required (e.g., 3 of 5) + validator_commitment: str # Merkle root of validator public keys + + # The proof itself + zk_proof: bytes # SNARK proof bytes + verification_key: str # Reference to verification key + + # On-chain registration + chain_id: int # EVM chain ID + registry_address: str # Certificate registry contract address + tx_hash: Optional[str] # Registration transaction hash + block_number: Optional[int] # Block number of registration + + # Verification URL + verify_url: str # Public URL to verify certificate +``` + +### 11.2 — What the ZK Proof Proves (Without Revealing) + +The proof attests to ALL of the following without revealing any details: + +``` +1. A reasoning chain of N steps was submitted +2. Each step was translated into a formal representation by M independent miners +3. A formal verification tool (Lean4 / sandbox / Z3) was run on each translation +4. K out of N steps compiled/passed/proved successfully +5. V out of W validators independently confirmed the verification +6. The verification was performed after timestamp T +7. 
The validators' combined stake exceeds threshold S +``` + +What remains HIDDEN: +``` +- The original reasoning chain (privacy) +- The formal translations (IP protection) +- Individual validator scores (anonymity) +- The specific model that produced the reasoning (model-agnostic) +- Any proprietary data in the query (confidentiality) +``` + +--- + +## 12. ZK Circuit Design + +### 12.1 — Circuit Architecture + +We use Groth16 (for constant-size proofs) via circom or Halo2 (for recursive composition). + +``` +Circuit: ReasoningVerificationProof + +Public Inputs: + - task_hash: Field (SHA-256 truncated to field element) + - verdict_hash: Field (SHA-256 of verification verdict) + - validator_root: Field (Merkle root of validator commitments) + - min_validators: uint32 (threshold) + - timestamp: uint64 + +Private Inputs (witness): + - step_verdicts: bool[MAX_STEPS] (per-step pass/fail) + - validator_signatures: Sig[MAX_VALS] (BLS/EdDSA signatures on verdict_hash) + - validator_pubkeys: PubKey[MAX_VALS] + - validator_stakes: uint64[MAX_VALS] + - merkle_proofs: MerkleProof[MAX_VALS] (prove validators are in commitment tree) + +Constraints: + 1. verdict_hash == SHA256(step_verdicts) + 2. For each validator i: + a. VerifySignature(validator_signatures[i], verdict_hash, validator_pubkeys[i]) == true + b. MerkleVerify(validator_pubkeys[i], merkle_proofs[i], validator_root) == true + 3. count(valid_signatures) >= min_validators + 4. 
sum(validator_stakes where valid) >= STAKE_THRESHOLD +``` + +### 12.2 — Implementation Choice + +```python +# We implement the ZK layer using one of: + +# Option A: Circom + SnarkJS (JavaScript-friendly, well-documented) +# Pros: Large community, easy to deploy verifier on EVM +# Cons: Trusted setup, limited circuit flexibility + +# Option B: Halo2 (Rust, recursive-friendly) +# Pros: No trusted setup, recursive composition native, high performance +# Cons: Smaller community, steeper learning curve + +# Option C: SP1 / Risc0 (zkVM approach) +# Pros: Write verification logic in Rust, compiled to ZK circuit automatically +# Cons: Larger proof size, slower proving + +# RECOMMENDED: Halo2 for recursive proofs + SP1 for complex verification logic +# This matches Xythum's existing ZK infrastructure +``` + +### 12.3 — Circuit Parameters + +```python +# Circuit constraints budget +MAX_REASONING_STEPS = 32 # Max steps per verification +MAX_VALIDATORS = 16 # Max validators per certificate +MERKLE_DEPTH = 10 # Supports up to 1024 validators in tree +HASH_FUNCTION = "Poseidon" # ZK-friendly hash (not SHA-256 in circuit) + +# Proof generation estimates +PROVING_TIME_SECONDS = (10, 30) # (min, max) on modern CPU +PROOF_SIZE_BYTES = 256 # Groth16 constant size +VERIFICATION_TIME_MS = (2, 5) # (min, max) on-chain verification +VERIFICATION_GAS = 250_000 # Approximate EVM gas cost +``` + +--- + +## 13. Recursive Proof Composition + +### 13.1 — Why Recursive + +A 10-step reasoning chain doesn't need 10 separate proofs. With recursive composition: + +``` +Step 1 proof: "Step 1 is verified" +Step 2 proof: "Step 2 is verified AND I've verified the proof that Step 1 is verified" +Step 3 proof: "Step 3 is verified AND I've verified the proof that Steps 1-2 are verified" +... +Step N proof: "Step N is verified AND I've verified the proof that Steps 1-(N-1) are verified" +``` + +The final proof is a SINGLE constant-size proof that the ENTIRE chain is valid. 
+ +### 13.2 — Recursive Circuit + +``` +RecursiveVerificationCircuit: + +Public Inputs: + - accumulated_verdict_hash: Field (running hash of all step verdicts) + - step_count: uint32 (how many steps verified so far) + - chain_commitment: Field (commitment to the full chain) + +Private Inputs: + - previous_proof: Proof (proof for steps 1..N-1) + - current_step_verdict: bool (did step N pass?) + - current_step_formal: Field (hash of formal representation) + - verification_output: Field (hash of verifier output) + +Constraints: + 1. Verify(previous_proof, previous_public_inputs) == true + 2. accumulated_verdict_hash == Poseidon( + previous_accumulated_hash, current_step_verdict, current_step_formal + ) + 3. step_count == previous_step_count + 1 +``` + +### 13.3 — Implementation + +```python +class RecursiveProver: + """ + Compose step-level verification proofs into a single recursive proof. + Uses Halo2 IVC (Incrementally Verifiable Computation). + """ + + def __init__(self, params_path: str): + self.params = load_params(params_path) + + async def prove_chain( + self, + step_verdicts: list[StepVerdict], + validator_commitments: list[ValidatorCommitment], + ) -> VerificationCertificate: + """ + Build recursive proof from step verdicts. + + Process: + 1. Base case: prove step 1 + 2. For each subsequent step: fold in step N proof with previous accumulator + 3. 
Final: wrap in certificate with validator attestations + """ + + # Base case + accumulator = await self._prove_base(step_verdicts[0]) + + # Recursive folding + for verdict in step_verdicts[1:]: + accumulator = await self._fold_step(accumulator, verdict) + + # Add validator attestations + final_proof = await self._finalize( + accumulator, validator_commitments + ) + + # Construct certificate + return VerificationCertificate( + certificate_id=self._compute_id(final_proof), + zk_proof=final_proof.to_bytes(), + total_steps=len(step_verdicts), + verified_steps=sum(1 for v in step_verdicts if v.verified), + overall_verdict=self._compute_verdict(step_verdicts), + ... + ) +``` + +--- + +## 14. On-Chain Certificate Registry + +### 14.1 — Smart Contract (Solidity) + +```solidity +// SPDX-License-Identifier: MIT +pragma solidity ^0.8.24; + +import "./Verifier.sol"; // Auto-generated Groth16/Halo2 verifier + +contract ReasonForgeCertificateRegistry { + + struct Certificate { + bytes32 taskHash; // Hash of the verified reasoning + bytes32 verdictHash; // Hash of verification verdict + uint8 domain; // 0=math, 1=code, 2=logic + uint8 proofLevel; // 0=formal, 1=standard, 2=quick + uint16 totalSteps; + uint16 verifiedSteps; + uint8 verdict; // 0=VERIFIED, 1=PARTIAL, 2=FAILED + uint64 timestamp; + uint8 validatorCount; + address registrant; // Who registered (validator's address) + } + + // State + mapping(bytes32 => Certificate) public certificates; // certId → cert + mapping(bytes32 => bool) public proofVerified; // certId → was ZK proof valid + Verifier public immutable verifier; // ZK proof verifier + + uint256 public totalCertificates; + uint256 public totalVerified; + + // Events + event CertificateRegistered( + bytes32 indexed certificateId, + bytes32 indexed taskHash, + uint8 verdict, + uint16 verifiedSteps, + uint16 totalSteps + ); + + constructor(address _verifier) { + verifier = Verifier(_verifier); + } + + /** + * @notice Register a new verification certificate with ZK 
proof + * @param certId Unique certificate identifier + * @param cert Certificate metadata + * @param proof ZK-SNARK proof bytes + * @param publicInputs Public inputs to the ZK circuit + */ + function registerCertificate( + bytes32 certId, + Certificate calldata cert, + bytes calldata proof, + uint256[] calldata publicInputs + ) external { + require(certificates[certId].timestamp == 0, "Certificate exists"); + + // Verify ZK proof on-chain + bool valid = verifier.verify(proof, publicInputs); + require(valid, "Invalid ZK proof"); + + // Store + certificates[certId] = cert; + proofVerified[certId] = true; + totalCertificates++; + + if (cert.verdict == 0) { // VERIFIED + totalVerified++; + } + + emit CertificateRegistered( + certId, cert.taskHash, cert.verdict, + cert.verifiedSteps, cert.totalSteps + ); + } + + /** + * @notice Check if reasoning has a valid verification certificate + * @param taskHash Hash of the reasoning chain to check + * @return True if a VERIFIED certificate exists + */ + function isVerified(bytes32 taskHash) external view returns (bool) { + // Search for certificate by task hash + // In production: use a taskHash → certId mapping + ... + } + + /** + * @notice Anyone can verify a certificate's ZK proof + * @param certId Certificate to verify + * @return True if the ZK proof is valid + */ + function verifyCertificate(bytes32 certId) external view returns (bool) { + return proofVerified[certId]; + } +} +``` + +### 14.2 — Deployment Targets + +``` +Primary: EVM (Ethereum L2 — Arbitrum or Base for low gas costs) +Secondary: Bittensor EVM (when available) +Future: Solana, Cosmos IBC +``` + +--- + +## 15. Certificate Verification Contract + +The Verifier.sol is auto-generated from the ZK circuit. For Groth16: + +```solidity +// Auto-generated by snarkjs or halo2 export +contract Verifier { + // Verification key embedded as constants + uint256 constant ALPHA_X = ...; + uint256 constant ALPHA_Y = ...; + // ... 
(elliptic curve points) + + function verify( + bytes calldata proof, + uint256[] calldata publicInputs + ) external view returns (bool) { + // Pairing check + // Gas cost: ~250,000 + ... + } +} +``` + +--- + +# PART IV — BITTENSOR INTEGRATION (REVISED) + +--- + +## 16. New Synapse Protocol + +### 16.1 — TranslationTask Synapse (Validator → Miner) + +```python +class TranslationTask(bt.Synapse): + """ + Validator sends NL reasoning chain to miner for formal translation. + This REPLACES the old ReasoningTask synapse. + """ + + # Immutable (validator sets) + task_id: str + original_query: str + reasoning_chain: list[dict] # [{step_id, content, claimed_conclusion}] + domain: str # "mathematics" | "code" | "logic" + difficulty: int + proof_level: str # "formal" | "standard" | "quick" + timeout_seconds: int = 300 + + # Mutable (miner fills) + translations: Optional[list[dict]] = None # StepTranslation objects as dicts + compilation_status: Optional[str] = None # "COMPILED" | "PARTIAL" | "FAILED" + full_proof_artifact: Optional[str] = None # Base64 of complete proof file + translation_time_ms: Optional[int] = None + submission_hash: Optional[str] = None + + required_hash_fields: list[str] = [ + "task_id", "original_query", "domain", "difficulty", "proof_level" + ] + + def deserialize(self) -> dict: + return { + "translations": self.translations or [], + "compilation_status": self.compilation_status, + "full_proof_artifact": self.full_proof_artifact, + "translation_time_ms": self.translation_time_ms, + "submission_hash": self.submission_hash, + } +``` + +### 16.2 — VerificationResult Synapse (Validator → Miner, informational) + +```python +class VerificationResult(bt.Synapse): + """Validator notifies miner of their verification results.""" + + epoch_id: int + miner_uid: int + tasks_translated: int + steps_compiled: int # How many of their translations compiled + steps_total: int + compilation_rate: float # steps_compiled / steps_total + epoch_score: float + rank: int + 
tao_earned: float + + required_hash_fields: list[str] = ["epoch_id", "miner_uid"] +``` + +--- + +## 17. Miner Role: Translator + +### 17.1 — Translator Miner Architecture + +```python +class TranslatorMiner(BaseNeuron): + """ + Miners in the Proof Layer DON'T solve problems. + They TRANSLATE natural language reasoning into formal proofs. + + This requires: + 1. Understanding the reasoning (comprehension) + 2. Knowing the target formal language (Lean4/Python/SMT-LIB) + 3. Producing compilable/executable output (precision) + """ + + def __init__(self, config): + super().__init__(config) + + self.translator = TranslationEngine( + backend=self.config.miner.backend, + model=self.config.miner.model, + domains=self.config.miner.domains, + ) + + # Local compilation check (optional but improves score) + self.local_lean = LocalLeanChecker() if "mathematics" in self.config.miner.domains else None + self.local_python = LocalPythonChecker() if "code" in self.config.miner.domains else None + self.local_z3 = LocalZ3Checker() if "logic" in self.config.miner.domains else None + + async def handle_translation_task(self, synapse: TranslationTask) -> TranslationTask: + """ + Core handler: + 1. Parse the reasoning chain + 2. For each step, generate formal translation + 3. Optionally verify locally before submitting + 4. 
Return translations + """ + start = time.time_ns() + + try: + translations = [] + for step in synapse.reasoning_chain: + # Generate formal translation using LLM + formal = await self.translator.translate_step( + step=step, + domain=synapse.domain, + proof_level=synapse.proof_level, + previous_steps=[t for t in translations], # Context + original_query=synapse.original_query, + ) + + # Optional: local compilation check + if self.local_lean and synapse.domain == "mathematics": + formal.compilation_check = await self.local_lean.quick_check( + formal.formal_representation + ) + + translations.append(formal) + + # Fill Synapse + synapse.translations = [asdict(t) for t in translations] + synapse.compilation_status = self._assess_compilation(translations) + synapse.full_proof_artifact = self._build_full_artifact(translations, synapse.domain) + synapse.translation_time_ms = int((time.time_ns() - start) / 1_000_000) + synapse.submission_hash = self._compute_hash(synapse) + + except Exception as e: + bt.logging.error(f"Translation failed: {e}") + synapse.translations = [] + synapse.compilation_status = "FAILED" + + return synapse +``` + +### 17.2 — Translation Engine + +```python +class TranslationEngine: + """ + Uses LLM to translate NL reasoning steps into formal representations. + The system prompt is critical — it determines translation quality. + """ + + MATH_SYSTEM_PROMPT = """You are a Lean 4 formalization expert. +Your task: translate a natural language mathematical reasoning step into Lean 4 code. + +Rules: +1. Each step becomes a `theorem` or `lemma` with explicit type signature +2. Use `by` tactic blocks. Prefer: ring, linarith, omega, simp, norm_num, exact +3. Declare all assumptions as hypotheses in the type signature +4. Reference previous steps by importing their theorems +5. If a step is an assumption/given, use `axiom` or `variable` +6. MUST compile independently. Test mentally before submitting. +7. Include type annotations for all variables +8. 
Add comments mapping back to the natural language + +Output format: +```lean +-- NL: "{original natural language step}" +-- Dependencies: Step {N}, Step {M} + +import ReasonForge.Step{N} +import ReasonForge.Step{M} + +theorem step_{current_id} + (h1 : {type from step N}) + (h2 : {type from step M}) + : {conclusion type} := by + {tactic proof} +``` + +Do NOT use sorry. If you cannot prove it, say so explicitly.""" + + CODE_SYSTEM_PROMPT = """You are a code verification expert. +Your task: translate a natural language code reasoning step into executable Python with tests. + +Rules: +1. Each step produces: implementation + unit tests + property tests +2. Use hypothesis library for property-based testing +3. Include type hints on all functions +4. Tests must be runnable with pytest +5. If the step claims O(n) complexity, include a statistical timing test +6. Cover edge cases: empty input, single element, large input, negative numbers +7. Each test function must have a clear docstring explaining what it verifies + +Output format: +```python +# NL: "{original natural language step}" +# Verifies: {what this step claims} + +def {function_name}({params}: {types}) -> {return_type}: + \"\"\"Implementation of step {N}.\"\"\" + ... + +def test_{function_name}_basic(): + \"\"\"Verify basic correctness.\"\"\" + assert ... + +def test_{function_name}_edge_cases(): + \"\"\"Verify edge case handling.\"\"\" + assert ... + +@given(...) +def test_{function_name}_properties(): + \"\"\"Verify claimed properties hold for all inputs.\"\"\" + assert ... +```""" + + LOGIC_SYSTEM_PROMPT = """You are a formal logic expert. +Your task: translate a natural language logical reasoning step into SMT-LIB format. + +Rules: +1. Declare all sorts, functions, and predicates +2. Assert all premises explicitly +3. To prove a conclusion: assert its NEGATION and check for UNSAT +4. Use quantifiers (forall, exists) where appropriate +5. Keep sorts minimal — only declare what's needed +6. 
Include comments mapping back to natural language + +Output format: +```smt2 +; NL: "{original natural language step}" +; Proves: {conclusion} follows from {premises} + +(set-logic ALL) + +; Declarations +(declare-sort ...) +(declare-fun ...) + +; Premises (from previous steps) +(assert ...) + +; Negated conclusion (proof by refutation) +(assert (not ...)) + +(check-sat) +; Expected: unsat (conclusion is valid) +```""" + + async def translate_step( + self, + step: dict, + domain: str, + proof_level: str, + previous_steps: list, + original_query: str, + ) -> StepTranslation: + """Use LLM to translate one reasoning step.""" + + # Select system prompt based on domain + system_prompt = { + "mathematics": self.MATH_SYSTEM_PROMPT, + "code": self.CODE_SYSTEM_PROMPT, + "logic": self.LOGIC_SYSTEM_PROMPT, + }[domain] + + # Build context from previous steps + context = self._build_context(previous_steps, original_query) + + # Call LLM + response = await self.backend.generate_structured( + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": f""" +Original problem: {original_query} + +Previous verified steps: +{context} + +Current step to translate: +Step {step['step_id']}: {step['content']} +Claimed conclusion: {step.get('claimed_conclusion', 'N/A')} + +Translate this step into {domain} formal representation. +"""}, + ], + schema=StepTranslationSchema, + timeout=60, + ) + + return StepTranslation( + step_id=step["step_id"], + original_content=step["content"], + formal_representation=response["formal_representation"], + dependencies=response.get("dependencies", []), + translation_confidence=response.get("confidence", 0.5), + compilation_check=False, # Will be set by local checker + ) +``` + +--- + +## 18. Validator Role: Verifier + +### 18.1 — Revised Validator Epoch Loop + +```python +class ProofLayerValidator(BaseNeuron): + """ + Validators in the Proof Layer: + 1. Receive verification requests (from API or generated) + 2. 
Dispatch to multiple translators (miners) + 3. Run MECHANICAL verification on returned translations + 4. Compare: multiple independent translations should verify the SAME steps + 5. Generate verification verdict + 6. Produce ZK certificate + 7. Set on-chain weights based on translation quality + """ + + def run_epoch(self): + self.epoch_id += 1 + + # Phase A: Get verification requests + requests = self.task_manager.get_epoch_tasks() + + # Phase B: For each request, run verification pipeline + results = [] + for req in requests: + result = asyncio.run(self.verify_reasoning(req)) + results.append(result) + + # Phase C: Score miners based on translation quality + self.score_translators(results) + + # Phase D: Set weights + self.set_weights() + + # Phase E: Generate certificates for verified chains + for result in results: + if result.verdict.overall in ("VERIFIED", "PARTIAL"): + cert = asyncio.run(self.generate_certificate(result)) + self.register_certificate(cert) + + # Phase F: Persist & notify + self.save_state() + asyncio.run(self.notify_miners()) + + async def verify_reasoning(self, request: VerificationRequest) -> VerificationResult: + """ + Full verification pipeline for one reasoning chain. + """ + # 1. Select translators (miners) + miner_uids = self.select_translators(request.domain, n=5) + + # 2. Send translation task to each + synapse = TranslationTask( + task_id=request.task_id, + original_query=request.original_query, + reasoning_chain=request.reasoning_chain, + domain=request.domain, + difficulty=request.difficulty, + proof_level=request.proof_level, + ) + + axons = [self.metagraph.axons[uid] for uid in miner_uids] + responses = await self.dendrite( + axons=axons, synapse=synapse, timeout=request.timeout + ) + + # 3. 
For each miner's translation, run mechanical verification + miner_verdicts = {} + for uid, response in zip(miner_uids, responses): + if not response.translations: + miner_verdicts[uid] = None + continue + + translations = [StepTranslation(**t) for t in response.translations] + + # Run the appropriate verifier + if request.domain == "mathematics": + verdict = await self.lean4_verifier.verify_chain( + request.task_id, translations, request.original_query + ) + elif request.domain == "code": + verdict = await self.code_verifier.verify_chain( + request.task_id, translations, request.original_query + ) + elif request.domain == "logic": + verdict = await self.fol_verifier.verify_chain( + request.task_id, translations + ) + + miner_verdicts[uid] = verdict + + # 4. Cross-validate: compare verdicts across miners + # Multiple independent translations should agree on which steps are valid + consensus_verdict = self.cross_validate(miner_verdicts) + + # 5. Score each miner based on their translation quality + miner_scores = {} + for uid, verdict in miner_verdicts.items(): + if verdict is None: + miner_scores[uid] = 0.0 + else: + miner_scores[uid] = self.score_translation(verdict, consensus_verdict) + + return VerificationResult( + request=request, + verdict=consensus_verdict, + miner_verdicts=miner_verdicts, + miner_scores=miner_scores, + ) + + def cross_validate(self, miner_verdicts: dict) -> VerificationVerdict: + """ + Compare N independent translations to build consensus. + + If 3 out of 5 miners produce translations that verify step K, + step K is considered verified. This handles: + - Translation errors (one miner mistranslates) + - Model limitations (one miner's LLM can't formalize a step) + - Adversarial miners (one miner submits garbage) + """ + valid_verdicts = {uid: v for uid, v in miner_verdicts.items() if v is not None} + if not valid_verdicts: + return VerificationVerdict(overall="FAILED", ...) 
+ + # For each step, count how many miners got it to verify + step_counts = defaultdict(int) + step_total = defaultdict(int) + + for uid, verdict in valid_verdicts.items(): + for sv in verdict.step_verdicts: + step_total[sv.step_id] += 1 + if sv.verified: + step_counts[sv.step_id] += 1 + + # Step is verified if majority of miners got it to compile + threshold = len(valid_verdicts) / 2 + consensus_steps = [] + for step_id in sorted(step_total.keys()): + verified = step_counts[step_id] > threshold + consensus_steps.append(StepVerdict( + step_id=step_id, + verified=verified, + error_message=None if verified else f"Only {step_counts[step_id]}/{step_total[step_id]} translations compiled", + )) + + all_verified = all(s.verified for s in consensus_steps) + any_verified = any(s.verified for s in consensus_steps) + + return VerificationVerdict( + overall="VERIFIED" if all_verified else ("PARTIAL" if any_verified else "FAILED"), + step_verdicts=consensus_steps, + total_steps=len(consensus_steps), + verified_steps=sum(1 for s in consensus_steps if s.verified), + failure_points=[s for s in consensus_steps if not s.verified], + ) +``` + +--- + +## 19. Revised Incentive Mechanism + +### 19.1 — New Scoring Dimensions + +Replace the old CMS (Quality, Accuracy, Novelty, Efficiency) with dimensions +that measure TRANSLATION quality: + +```python +# ── New Composite Translation Score (CTS) ── +# Replaces CMS (Eq. 2) + +W_COMPILATION = 0.45 # Did the translation compile/execute? +W_CORRECTNESS = 0.30 # Did verification pass? (binary per step) +W_COMPLETENESS = 0.15 # What fraction of steps were translated? 
+W_EFFICIENCY = 0.10 # Translation time relative to timeout + +# CTS(m, t) = 0.45·Compilation + 0.30·Correctness + 0.15·Completeness + 0.10·Efficiency + +class TranslationScorer: + """Score miners based on their formal translations.""" + + @staticmethod + def compute_cts( + translations: list[StepTranslation], + verdict: VerificationVerdict, + total_steps: int, + time_ms: int, + timeout_ms: int, + ) -> float: + """Composite Translation Score — replaces CMS.""" + + if not translations: + return 0.0 + + # Compilation: what fraction of translations compiled? + compiled = sum(1 for t in translations if t.compilation_check) / len(translations) + + # Correctness: what fraction of steps passed verification? + if verdict and verdict.step_verdicts: + correct = sum(1 for sv in verdict.step_verdicts if sv.verified) / len(verdict.step_verdicts) + else: + correct = 0.0 + + # Completeness: did the miner translate all steps? + completeness = len(translations) / max(1, total_steps) + + # Efficiency: faster is better (but not suspiciously fast) + time_ratio = time_ms / max(1, timeout_ms) + if time_ratio < 0.02: # Suspiciously fast (< 2% of timeout) + efficiency = 0.1 + elif time_ratio > 1.0: # Timed out + efficiency = 0.0 + else: + efficiency = 1.0 - (time_ratio * 0.4) + + return ( + W_COMPILATION * compiled + + W_CORRECTNESS * correct + + W_COMPLETENESS * min(1.0, completeness) + + W_EFFICIENCY * efficiency + ) +``` + +### 19.2 — Equations That Stay the Same + +These work identically — just swap CMS for CTS: + +``` +Eq. 3 (S_epoch): S_epoch(m) = avg(CTS(m,t) · D(t)) · trap_penalty — SAME formula, CTS replaces CMS +Eq. 4 (PEB): PEB(m) = α · (1/rank) · √streak — UNCHANGED +Eq. 5 (Emission): R(m) = E_miner · [S·(1+PEB)] / Σ[S·(1+PEB)] — UNCHANGED +Eq. 9 (Trap): trap_penalty = avg_trap_score / θ if avg_trap_score < θ — UNCHANGED +Eq.
10 (Slash): slash(v) = γ · stake · (θ - VAS)² — UNCHANGED +``` + +### 19.3 — Revised Trap Problems + +Traps are now reasoning chains with KNOWN formal translations: + +```python +# Trap for mathematics domain +trap_task = { + "reasoning_chain": [ + {"step_id": 1, "content": "Let n be an arbitrary integer"}, + {"step_id": 2, "content": "Since n² ≥ 0 for all integers, we have n² + 1 > 0"}, + {"step_id": 3, "content": "Therefore n² + 1 ≠ 0, so 1/(n²+1) is well-defined"}, + ], + "ground_truth_translations": { + 1: "variable (n : ℤ)", + 2: "theorem step2 (n : ℤ) : n^2 + 1 > 0 := by positivity", + 3: "theorem step3 (n : ℤ) : n^2 + 1 ≠ 0 := by linarith [sq_nonneg n]", + }, + "ground_truth_verdict": "VERIFIED", # All 3 steps should verify +} +``` + +--- + +## 20. Weight Computation + +Weights sent to chain are the same formula as before, just driven by CTS instead of CMS: + +```python +def compute_weights(self, miner_states: dict[int, MinerState], n: int): + """ + Map CTS-based epoch scores → on-chain weight vector. + Yuma Consensus on-chain then determines actual TAO emissions. + """ + weights = torch.zeros(n) + for uid, state in miner_states.items(): + weights[uid] = state.s_epoch * (1.0 + state.peb) + + # Normalize + if weights.sum() > 0: + weights = weights / weights.sum() + + return weights +``` + +--- + +# PART V — ENTERPRISE API PRODUCT + +--- + +## 21. 
Verification-as-a-Service API + +### 21.1 — Endpoints + +``` +POST /v1/verify — Submit reasoning chain for verification +GET /v1/verify/{task_id} — Poll for verification result +GET /v1/certificates/{cert_id} — Get certificate details +POST /v1/certificates/{cert_id}/verify — Verify certificate ZK proof (off-chain) +GET /v1/stats — Network statistics +WS /v1/stream — Real-time verification updates +``` + +### 21.2 — Request/Response + +```python +# Request +class VerifyRequest(BaseModel): + reasoning_chain: list[ReasoningStep] # The AI's reasoning to verify + domain: Literal["mathematics", "code", "logic"] + original_query: str # What was asked + claimed_answer: str # What the AI concluded + proof_level: Literal["formal", "standard", "quick"] = "standard" + callback_url: Optional[str] = None # Webhook on completion + generate_certificate: bool = True # Generate ZK certificate? + +class ReasoningStep(BaseModel): + step_id: int + content: str # Natural language reasoning + claimed_conclusion: Optional[str] = None + +# Response +class VerifyResponse(BaseModel): + task_id: str + status: str # "queued" | "translating" | "verifying" | "complete" + verdict: Optional[str] = None # "VERIFIED" | "PARTIAL" | "FAILED" + steps_verified: Optional[int] = None + steps_total: Optional[int] = None + failure_points: Optional[list[FailureDetail]] = None + certificate_id: Optional[str] = None + certificate_url: Optional[str] = None + verification_time_ms: Optional[int] = None + cost_credits: Optional[float] = None # Credits consumed + +class FailureDetail(BaseModel): + step_id: int + original_content: str + error: str + suggested_fix: Optional[str] = None + cascade_impact: list[int] # Steps invalidated by this failure +``` + +### 21.3 — Pricing Model + +```python +PRICING = { + "formal": { + "per_step": 0.50, # $0.50 per reasoning step + "base_fee": 2.00, # $2.00 base + "certificate_fee": 1.00, # $1.00 for ZK certificate + }, + "standard": { + "per_step": 0.10, + "base_fee": 0.50, + 
"certificate_fee": 0.50, + }, + "quick": { + "per_step": 0.02, + "base_fee": 0.10, + "certificate_fee": 0.25, + }, +} + +# Example: 10-step mathematical proof, formal level +# Cost = $2.00 + (10 × $0.50) + $1.00 = $8.00 + +# Free tier: 50 quick verifications per month +# Pro tier: $99/mo, 500 standard verifications included +# Enterprise tier: Custom pricing, SLA, dedicated validators +``` + +--- + +## 22. SDK + +### 22.1 — Python SDK + +```python +from reasonforge import ReasonForge + +rf = ReasonForge(api_key="rf_...") + +# Verify a reasoning chain +result = rf.verify( + reasoning_chain=[ + {"step_id": 1, "content": "Let x = 3 and y = 4"}, + {"step_id": 2, "content": "By Pythagorean theorem, x² + y² = z²"}, + {"step_id": 3, "content": "So z² = 9 + 16 = 25"}, + {"step_id": 4, "content": "Therefore z = 5"}, + ], + domain="mathematics", + original_query="Find the hypotenuse of a right triangle with legs 3 and 4", + claimed_answer="z = 5", + proof_level="formal", +) + +print(result.verdict) # "VERIFIED" +print(result.certificate_url) # https://verify.reasonforge.ai/0xabc... + +# Check if a step failed +if result.verdict == "PARTIAL": + for failure in result.failure_points: + print(f"Step {failure.step_id} failed: {failure.error}") + print(f" Last valid step: {failure.step_id - 1}") +``` + +### 22.2 — TypeScript SDK + +```typescript +import { ReasonForge } from '@reasonforge/sdk'; + +const rf = new ReasonForge({ apiKey: 'rf_...' 
}); + +const result = await rf.verify({ + reasoningChain: [ + { stepId: 1, content: 'Initialize empty hash map' }, + { stepId: 2, content: 'Iterate through array, count frequencies' }, + { stepId: 3, content: 'Return the key with maximum count' }, + ], + domain: 'code', + originalQuery: 'Find the most frequent element in an array', + claimedAnswer: 'Use hash map for O(n) solution', + proofLevel: 'standard', +}); + +console.log(result.verdict); // "VERIFIED" +``` + +### 22.3 — Model Provider Integration (OpenAI wrapper example) + +```python +import openai +from reasonforge import ReasonForge + +rf = ReasonForge(api_key="rf_...") +client = openai.OpenAI() + +# Step 1: Get reasoning from OpenAI +response = client.chat.completions.create( + model="o1-preview", + messages=[{"role": "user", "content": "Prove that √2 is irrational"}], +) + +# Step 2: Parse reasoning steps (o1 returns them in thinking) +steps = rf.parse_reasoning_chain(response) # Auto-extract from model output + +# Step 3: Verify +verification = rf.verify( + reasoning_chain=steps, + domain="mathematics", + original_query="Prove that √2 is irrational", + claimed_answer=response.choices[0].message.content, + proof_level="formal", +) + +# Step 4: Attach certificate to your application's response +if verification.verdict == "VERIFIED": + print(f"✓ Reasoning formally verified: {verification.certificate_url}") +else: + print(f"⚠ Verification failed at step {verification.failure_points[0].step_id}") +``` + +--- + +## 23. Model Provider Integrations + +Build first-party integrations for the top model providers: + +```python +# reasonforge/integrations/openai.py +class OpenAIIntegration: + """Auto-extract reasoning chains from o1/o3 thinking tokens.""" + def parse(self, response) -> list[ReasoningStep]: ... + +# reasonforge/integrations/anthropic.py +class AnthropicIntegration: + """Auto-extract reasoning from Claude's extended thinking.""" + def parse(self, response) -> list[ReasoningStep]: ... 
+ +# reasonforge/integrations/deepseek.py +class DeepSeekIntegration: + """Auto-extract reasoning from DeepSeek R1 traces.""" + def parse(self, response) -> list[ReasoningStep]: ... + +# reasonforge/integrations/langchain.py +class LangChainIntegration: + """Middleware that auto-verifies LangChain agent reasoning chains.""" + def as_callback(self) -> BaseCallbackHandler: ... +``` + +--- + +## 24. Compliance Report Generator + +For enterprise customers who need audit-ready documentation: + +```python +class ComplianceReportGenerator: + """ + Generate PDF compliance reports from verification results. + Suitable for regulatory submission (EU AI Act, FDA, SEC). + """ + + def generate(self, verification_result, template="eu_ai_act") -> bytes: + """ + Report includes: + - Original query and reasoning chain + - Per-step verification verdict with formal proofs + - Failure analysis (if any) + - Certificate reference (on-chain tx hash) + - Validator participation details + - Methodology description (suitable for auditor) + - Timestamp and chain-of-custody proof + """ + ... +``` + +--- + +# PART VI — BUILD ORDER & MILESTONES + +--- + +## 25. 
Directory Structure + +``` +reasonforge/ +├── [All MVP + Production files from previous plans] +│ +├── reasonforge/ +│ ├── [Existing modules] +│ │ +│ ├── translation/ # [NEW] NL-to-Formal pipeline +│ │ ├── __init__.py +│ │ ├── engine.py # TranslationEngine (LLM-powered) +│ │ ├── prompts.py # Domain-specific system prompts +│ │ ├── parsers.py # Parse LLM output into StepTranslation +│ │ └── types.py # TranslationRequest, TranslationResult, StepTranslation +│ │ +│ ├── verification/ # [EXPANDED] Now the core product +│ │ ├── __init__.py +│ │ ├── lean4_verifier.py # Full Lean 4 verification pipeline +│ │ ├── code_verifier.py # Code execution + property testing +│ │ ├── fol_verifier.py # SMT-LIB / Z3 verification +│ │ ├── cross_validator.py # Multi-miner consensus on verdicts +│ │ ├── verdict.py # VerificationVerdict, StepVerdict, FailureReport +│ │ └── process_supervisor.py # Dependency graph, failure cascade +│ │ +│ ├── certificates/ # [NEW] ZK proof layer +│ │ ├── __init__.py +│ │ ├── schema.py # VerificationCertificate dataclass +│ │ ├── prover.py # ZK proof generation (Halo2/Circom) +│ │ ├── recursive.py # Recursive proof composition +│ │ ├── registry.py # On-chain certificate registration +│ │ └── verifier_client.py # Off-chain certificate verification +│ │ +│ ├── contracts/ # [NEW] Solidity contracts +│ │ ├── CertificateRegistry.sol +│ │ ├── Verifier.sol # Auto-generated from ZK circuit +│ │ ├── deploy.py # Deployment scripts +│ │ └── abi/ # Compiled ABIs +│ │ +│ ├── sdk/ # [NEW] Client SDKs +│ │ ├── python/ +│ │ │ ├── reasonforge/__init__.py +│ │ │ ├── reasonforge/client.py +│ │ │ ├── reasonforge/types.py +│ │ │ └── setup.py +│ │ └── typescript/ +│ │ ├── src/index.ts +│ │ ├── src/client.ts +│ │ ├── src/types.ts +│ │ └── package.json +│ │ +│ └── integrations/ # [NEW] Model provider integrations +│ ├── __init__.py +│ ├── openai.py +│ ├── anthropic.py +│ ├── deepseek.py +│ └── langchain.py +│ +├── circuits/ # [NEW] ZK circuits +│ ├── verification_circuit/ +│ │ ├── 
src/main.rs # Halo2 circuit (or circom) +│ │ ├── Cargo.toml +│ │ └── params/ # Proving/verification keys +│ └── recursive_circuit/ +│ ├── src/main.rs +│ ├── Cargo.toml +│ └── params/ +│ +├── lean_templates/ # [NEW] Lean 4 project templates +│ ├── lakefile.lean +│ ├── lean-toolchain +│ └── ReasonForge/ +│ └── Template.lean +│ +├── benchmarks/ # [REVISED] Only 3 domains +│ ├── mathematics/ +│ ├── code/ +│ └── logic/ +│ +└── docs/ # [EXPANDED] + ├── [Previous docs] + ├── PROOF_LAYER.md # This document + ├── ZK_ARCHITECTURE.md # ZK circuit documentation + ├── CERTIFICATE_SPEC.md # Certificate format specification + └── INTEGRATION_GUIDE.md # Model provider integration guide +``` + +--- + +## 26. Phase-by-Phase Build Order + +``` +PHASE 1 — DOMAIN NARROWING (Week 1) + Step 1: Remove scientific, strategic, causal, ethical domains from types.py + Step 2: Update task_generator.py for 3 domains only + Step 3: Update all tests for 3-domain model + Step 4: Run: pytest — verify nothing broke + +PHASE 2 — TRANSLATION TYPES (Week 1) + Step 5: Write reasonforge/translation/types.py + Step 6: Write reasonforge/verification/verdict.py (VerificationVerdict, StepVerdict) + Step 7: Write tests for new types + Step 8: Run: pytest + +PHASE 3 — LEAN 4 VERIFICATION (Weeks 2-3) + Step 9: Write lean_templates/ (project scaffold) + Step 10: Write reasonforge/verification/lean4_verifier.py + Step 11: Write 20 test cases with known Lean 4 proofs + Step 12: Write tests/test_lean4.py + Step 13: Run: pytest tests/test_lean4.py -v + Step 14: Benchmark: latency for 5-step / 10-step / 20-step proofs + +PHASE 4 — CODE VERIFICATION (Week 3) + Step 15: Write reasonforge/verification/code_verifier.py + Step 16: Write docker/Dockerfile.sandbox (hardened) + Step 17: Write 20 test cases with known-correct code translations + Step 18: Write tests/test_code_verifier.py + Step 19: Run: pytest tests/test_code_verifier.py -v + +PHASE 5 — FOL VERIFICATION (Week 4) + Step 20: Install Z3 in validator environment + 
Step 21: Write reasonforge/verification/fol_verifier.py + Step 22: Write 20 test cases with known-valid SMT-LIB formulas + Step 23: Write tests/test_fol_verifier.py + Step 24: Run: pytest tests/test_fol_verifier.py -v + +PHASE 6 — TRANSLATION ENGINE (Weeks 4-5) + Step 25: Write reasonforge/translation/prompts.py (3 domain system prompts) + Step 26: Write reasonforge/translation/parsers.py + Step 27: Write reasonforge/translation/engine.py + Step 28: Write tests — generate translations for 50 benchmark problems + Step 29: Measure: compilation rate, correctness rate per domain + Step 30: Iterate on prompts until compilation rate > 70% + +PHASE 7 — PROCESS SUPERVISION (Week 5) + Step 31: Write reasonforge/verification/process_supervisor.py (dependency DAG) + Step 32: Write reasonforge/verification/cross_validator.py (multi-miner consensus) + Step 33: Write tests for failure cascade, cross-validation + Step 34: Run: pytest + +PHASE 8 — NEW SCORING MECHANISM (Week 6) + Step 35: Replace CMS with CTS in engine.py (new Eq. 
2) + Step 36: Update simulator.py for translator miner profiles + Step 37: Update all formula tests + Step 38: Run CLI simulation with new scoring: verify elite translators score highest + Step 39: Run: pytest — all tests pass with new scoring + +PHASE 9 — REVISED BITTENSOR NEURONS (Weeks 6-7) + Step 40: Write new protocol.py (TranslationTask, VerificationResult synapses) + Step 41: Write neurons/miner.py as TranslatorMiner + Step 42: Write neurons/validator.py as ProofLayerValidator + Step 43: Write tests/test_integration_local.py for new pipeline + Step 44: Test on localnet: validator → miner → translation → verification → weights + +PHASE 10 — ZK CERTIFICATES (Weeks 8-10) + Step 45: Design ZK circuit (choose Halo2 or Circom) + Step 46: Write circuits/verification_circuit/ + Step 47: Write circuits/recursive_circuit/ + Step 48: Generate proving + verification keys + Step 49: Write reasonforge/certificates/prover.py + Step 50: Write reasonforge/certificates/recursive.py + Step 51: Write reasonforge/certificates/schema.py + Step 52: Write tests — generate certificate for known verification + Step 53: Benchmark: proving time, proof size, verification time + +PHASE 11 — ON-CHAIN REGISTRY (Weeks 10-11) + Step 54: Write contracts/CertificateRegistry.sol + Step 55: Generate contracts/Verifier.sol from circuit + Step 56: Write contracts/deploy.py + Step 57: Deploy to testnet (Arbitrum Sepolia or Base Sepolia) + Step 58: Write reasonforge/certificates/registry.py (Python ↔ contract) + Step 59: Write reasonforge/certificates/verifier_client.py + Step 60: End-to-end test: verify → prove → register → verify on-chain + +PHASE 12 — ENTERPRISE API (Weeks 11-12) + Step 61: Rewrite gateway/app.py for verification API + Step 62: Write gateway/schemas.py (VerifyRequest, VerifyResponse) + Step 63: Add webhook callbacks for async verification + Step 64: Add pricing/billing logic + Step 65: Write tests/test_api.py + Step 66: Load test: 100 concurrent verification requests + +PHASE 13 
— SDKs (Week 12) + Step 67: Write sdk/python/ (pip installable) + Step 68: Write sdk/typescript/ (npm installable) + Step 69: Write integration examples for OpenAI, Anthropic, LangChain + Step 70: Write sdk tests + +PHASE 14 — COMPLIANCE & DOCS (Week 13) + Step 71: Write ComplianceReportGenerator (PDF output) + Step 72: Write docs/PROOF_LAYER.md + Step 73: Write docs/ZK_ARCHITECTURE.md + Step 74: Write docs/CERTIFICATE_SPEC.md + Step 75: Write docs/INTEGRATION_GUIDE.md + Step 76: Update README.md for proof layer positioning + +PHASE 15 — FINAL INTEGRATION (Weeks 14-16) + Step 77: Full end-to-end test: API → subnet → verify → certificate → on-chain + Step 78: Run 100-epoch simulation with new scoring + Step 79: Security audit: adversarial translations, sandbox escape, ZK soundness + Step 80: Performance optimization: parallelize verification, cache embeddings + Step 81: Deploy validator + miner on Bittensor testnet + Step 82: Run for 48 hours, monitor stability + Step 83: Fix issues, re-deploy + Step 84: Write launch blog post + Step 85: Tag v1.0.0 +``` + +--- + +## 27. 
Success Criteria + +### Must-Have (v1.0) +- [ ] Lean 4 verifier: >80% compilation rate on benchmark math problems +- [ ] Code verifier: >90% test pass rate on benchmark code problems +- [ ] FOL verifier: >85% correct SAT/UNSAT on benchmark logic problems +- [ ] Translation engine: >70% of translations compile on first attempt +- [ ] Cross-validation: 3/5 miner agreement on step verdicts +- [ ] ZK certificate: generated in <30 seconds +- [ ] On-chain verification: <250k gas on EVM +- [ ] API: <60 second latency for 10-step standard verification +- [ ] Localnet: full epoch cycle works end-to-end +- [ ] All tests pass, no regressions from MVP + +### Should-Have (v1.1) +- [ ] Recursive proofs: single proof for N-step chain +- [ ] Python SDK on PyPI +- [ ] TypeScript SDK on NPM +- [ ] OpenAI integration: auto-extract from o1 thinking tokens +- [ ] Compliance report generator: PDF output +- [ ] Grafana dashboards: verification success rates, miner leaderboards +- [ ] Docker deployment: one-command validator/miner setup + +### Nice-to-Have (v1.2+) +- [ ] Anthropic integration: extended thinking extraction +- [ ] LangChain callback middleware +- [ ] Cross-subnet API (other Bittensor subnets query ReasonForge) +- [ ] Multi-chain certificate registry (Ethereum + Arbitrum + Base) +- [ ] Formal verification of the ZK circuit itself (meta-verification) + +--- + +## 28. 
Dependency Map + +``` +types.py ──────────────────────────────────────────────────────────┐ + │ +translation/types.py ──┬── translation/engine.py │ + │ │ │ + │ ▼ │ + │ translation/prompts.py │ + │ translation/parsers.py │ + │ │ +verification/verdict.py┤ │ + │ │ + ├── verification/lean4_verifier.py │ + ├── verification/code_verifier.py │ + ├── verification/fol_verifier.py │ + │ │ │ + │ ▼ │ + ├── verification/cross_validator.py │ + ├── verification/process_supervisor.py │ + │ │ │ + │ ▼ │ +engine.py ─────────────┼── scoring (CTS replaces CMS) │ + │ │ │ + │ ▼ │ + ├── certificates/prover.py ◄── circuits/ │ + ├── certificates/recursive.py │ + ├── certificates/schema.py │ + │ │ │ + │ ▼ │ + ├── certificates/registry.py ◄── contracts/ │ + │ │ │ + │ ▼ │ +protocol.py ───────────┼── neurons/miner.py (TranslatorMiner) │ + ├── neurons/validator.py (ProofLayerValidator) + │ │ │ + │ ▼ │ + └── gateway/app.py ──► sdk/python/ │ + sdk/typescript/ │ + integrations/ │ +``` + +--- + +*End of Proof Layer build plan. This transforms ReasonForge from "another AI scoring subnet" into "the trust infrastructure for all AI reasoning." 
The moat is the intersection of formal verification + Bittensor incentives + ZK proofs — a combination that requires exactly the skillset you have.* diff --git a/Project1.docx b/Project1.docx new file mode 100644 index 0000000..94984c4 Binary files /dev/null and b/Project1.docx differ diff --git a/Project2.docx b/Project2.docx new file mode 100644 index 0000000..bef2982 Binary files /dev/null and b/Project2.docx differ diff --git a/api/server.py b/api/server.py index 15242e0..9c9633a 100644 --- a/api/server.py +++ b/api/server.py @@ -9,8 +9,8 @@ import asyncio import json -import sys import os +import sys from typing import Optional # Add parent directory to path so we can import reasonforge @@ -21,39 +21,38 @@ from fastapi.responses import StreamingResponse from pydantic import BaseModel +from reasonforge.simulator import ( + EpochSimulator, + create_default_miners, + create_default_validators, +) from reasonforge.types import ( + BREAKTHROUGH_MULTIPLIER, + BREAKTHROUGH_THRESHOLD, + CONSENSUS_TRIM_DELTA, + CONSENSUS_WEIGHT, DIFFICULTY_MULTIPLIER, DOMAIN_CHECK_WEIGHTS, EMISSION_MINER_SHARE, EMISSION_VALIDATOR_SHARE, + OBJECTIVE_WEIGHT, PEB_ALPHA, PEB_K, PEB_STREAK_CAP, - BREAKTHROUGH_MULTIPLIER, - BREAKTHROUGH_THRESHOLD, - TRAP_RATE, - TRAP_THRESHOLD, - SIMILARITY_THRESHOLD, SIMILARITY_PENALTY, - VAS_SLASH_THRESHOLD, - VAS_SLASH_GAMMA, - VAS_REP_THRESHOLD, - VAS_REP_MAX_MULTIPLIER, + SIMILARITY_THRESHOLD, TASKS_PER_EPOCH, + TRAP_RATE, + TRAP_THRESHOLD, VALIDATORS_PER_TASK, - OBJECTIVE_WEIGHT, - CONSENSUS_WEIGHT, - CONSENSUS_TRIM_DELTA, - W_QUALITY, + VAS_REP_MAX_MULTIPLIER, + VAS_REP_THRESHOLD, + VAS_SLASH_GAMMA, + VAS_SLASH_THRESHOLD, W_ACCURACY, - W_NOVELTY, W_EFFICIENCY, - Domain, -) -from reasonforge.simulator import ( - EpochSimulator, - create_default_miners, - create_default_validators, + W_NOVELTY, + W_QUALITY, ) app = FastAPI( @@ -76,6 +75,7 @@ # Request/Response Models # ────────────────────────────────────────────── + class SimulateRequest(BaseModel): 
epochs: int = 5 emission: float = 100.0 @@ -86,6 +86,7 @@ class SimulateRequest(BaseModel): # Endpoints # ────────────────────────────────────────────── + @app.get("/api/health") async def health(): """Health check endpoint.""" @@ -154,7 +155,7 @@ async def event_generator(): yield f"data: {data}\n\n" await asyncio.sleep(0.1) # Small delay between epochs - yield "data: {\"done\": true}\n\n" + yield 'data: {"done": true}\n\n' return StreamingResponse( event_generator(), @@ -215,9 +216,7 @@ async def constants(): "CONSENSUS_TRIM_DELTA": CONSENSUS_TRIM_DELTA, }, "difficulty_multiplier": DIFFICULTY_MULTIPLIER, - "domain_check_weights": { - d.value: w for d, w in DOMAIN_CHECK_WEIGHTS.items() - }, + "domain_check_weights": {d.value: w for d, w in DOMAIN_CHECK_WEIGHTS.items()}, } diff --git a/benchmarks/causal/inference.json b/benchmarks/causal/inference.json new file mode 100644 index 0000000..cd97c4a --- /dev/null +++ b/benchmarks/causal/inference.json @@ -0,0 +1,122 @@ +[ + { + "task_id": "causal-infer-001", + "problem": "A study finds that cities with more fire stations have more fires. A journalist concludes that fire stations cause fires. Identify the flaw in this reasoning and propose the correct causal model.", + "domain": "causal", + "difficulty": 2, + "timeout_seconds": 120, + "ground_truth": "This is a classic case of confounding (common cause). The correct causal model: City size → More fire stations AND City size → More fires. Larger cities build more fire stations AND have more fires (more buildings, more people). City size is a confounder. The correlation between fire stations and fires is spurious—it disappears when conditioning on city size (or population). 
The journalist commits the cum hoc ergo propter hoc fallacy and confuses correlation with causation.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["confounding", "spurious-correlation", "common-cause"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "causal-infer-002", + "problem": "In an observational study, researchers want to estimate the causal effect of a college degree on earnings. They have data on education, earnings, parental income, IQ, and motivation. Draw a plausible causal DAG and identify a valid adjustment set for estimating the causal effect.", + "domain": "causal", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "Plausible DAG: Parental Income → Education, Parental Income → Earnings, IQ → Education, IQ → Earnings, Motivation → Education, Motivation → Earnings, Education → Earnings. Here Parental Income, IQ, and Motivation are confounders (they cause both Education and Earnings). A valid adjustment set: {Parental Income, IQ, Motivation}. By the backdoor criterion, conditioning on these blocks all backdoor paths from Education to Earnings. If Motivation is unmeasured, we have unobserved confounding and cannot identify the causal effect from observational data alone without additional assumptions (e.g., instrumental variables). A possible instrument: distance to nearest college (affects education but not earnings directly).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["DAG", "backdoor-criterion", "confounding", "adjustment-set"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "causal-infer-003", + "problem": "A hospital study finds that patients who receive a particular drug have higher mortality rates than those who don't.
However, when the data is stratified by disease severity, the drug REDUCES mortality in both mild and severe cases. Explain this paradox.", + "domain": "causal", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "This is Simpson's Paradox. The drug is preferentially given to sicker patients (confounding by indication). Severe cases: high mortality (say 80% without drug, 70% with drug). Mild cases: low mortality (say 10% without drug, 5% with drug). But if 90% of drug recipients have severe disease vs 20% of non-recipients, the aggregate mortality can be higher for the drug group despite it helping within each stratum. The correct causal conclusion is that the drug is beneficial. The stratified analysis controls for the confounder (severity). The aggregate analysis is confounded. Resolution: condition on disease severity (a confounder) to get the correct causal estimate. This is why randomized controlled trials are the gold standard—they balance confounders.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["simpson-paradox", "confounding-by-indication", "stratification"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "causal-infer-004", + "problem": "A tech company runs an A/B test for a new recommendation algorithm. The treatment group shows 5% higher click-through rate (CTR). The p-value is 0.03. A data scientist claims there is a 97% probability that the new algorithm is better. Is this interpretation correct?", + "domain": "causal", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "This interpretation is INCORRECT. The p-value is NOT the probability that the null hypothesis is true (or 1-p that the alternative is true). The p-value = 0.03 means: IF the null hypothesis were true (no real difference), there would be a 3% probability of observing a result at least as extreme as the one obtained. 
The probability that the algorithm is actually better depends on: (1) the prior probability of improvement, (2) the statistical power, (3) the effect size. This is the classic p-value misinterpretation. To get P(algorithm is better | data), one needs Bayesian analysis. Additionally, consider: multiple testing, practical significance (is 5% CTR lift meaningful?), novelty effects, and whether the sample is representative.", + "ground_truth_score": 0.0, + "is_trap": true, + "previously_unsolved": false, + "tags": ["trap", "p-value", "statistical-inference", "AB-testing", "common-misconception"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "causal-infer-005", + "problem": "Explain the difference between the Average Treatment Effect (ATE), the Average Treatment Effect on the Treated (ATT), and the Local Average Treatment Effect (LATE). When would each be the most relevant estimand?", + "domain": "causal", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "ATE = E[Y(1) - Y(0)] — the average causal effect across the entire population. Most relevant for universal policy decisions (e.g., should we mandate a vaccine for everyone?). ATT = E[Y(1) - Y(0) | T=1] — the average effect among those who actually received treatment. Most relevant when evaluating an existing program (e.g., did the job training program help those who enrolled?). ATE ≠ ATT when there is treatment effect heterogeneity correlated with selection. LATE = E[Y(1) - Y(0) | Compliers] — the average effect among 'compliers' in an instrumental variable (IV) design. Compliers are those who would take treatment when encouraged but not otherwise. Most relevant in IV studies (e.g., effect of military service on earnings using draft lottery as instrument, estimated only for those who served because they were drafted). 
Key insight: different estimands answer different policy questions, and the choice depends on the decision context.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["treatment-effects", "ATE", "ATT", "LATE", "estimands"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "causal-infer-006", + "problem": "Design a regression discontinuity study to estimate the causal effect of a scholarship on college GPA. The scholarship is awarded to students with SAT scores above 1400. Describe the key assumptions, potential threats to validity, and how you would implement the analysis.", + "domain": "causal", + "difficulty": 7, + "timeout_seconds": 360, + "ground_truth": "Design: Sharp RD with running variable = SAT score, cutoff = 1400, treatment = scholarship, outcome = college GPA. Key assumptions: (1) Continuity: potential outcomes E[Y(0)|X=x] and E[Y(1)|X=x] are continuous at x=1400. (2) No manipulation: students cannot precisely control their SAT score around the cutoff. (3) No compound treatment: nothing else changes discontinuously at 1400. Implementation: (1) Restrict to a bandwidth around 1400 (e.g., 1300-1500). (2) Fit local polynomial regression on each side of the cutoff. (3) The treatment effect is the discontinuity in the fitted values at x=1400. (4) Use optimal bandwidth selection (Imbens-Kalyanaraman). Threats: (1) Score manipulation (test retaking targeting 1400). (2) Other programs with same cutoff. (3) Functional form misspecification. 
Validation: test for discontinuities in pre-treatment covariates (demographics) at the cutoff; test for density discontinuity (McCrary test).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["regression-discontinuity", "quasi-experiment", "causal-identification"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "causal-infer-007", + "problem": "Using Pearl's do-calculus, explain the difference between P(Y|X) and P(Y|do(X)). Give a concrete example where these differ and explain why the distinction matters for causal reasoning.", + "domain": "causal", + "difficulty": 7, + "timeout_seconds": 360, + "ground_truth": "P(Y|X) is the conditional/observational probability: the distribution of Y among units where X was observed to take a particular value. P(Y|do(X)) is the interventional probability: the distribution of Y if we were to externally set X to a particular value, severing all causal arrows into X. Example: X = barometer reading, Y = storm occurrence, Z = atmospheric pressure. P(storm | barometer drops) is high because low pressure causes both. But P(storm | do(barometer drops)) — physically forcing the barometer down — has no effect on storms. The DAG: Z → X and Z → Y. Conditioning on X in P(Y|X) inherits information about Z. The do-operator in P(Y|do(X)) cuts the Z → X edge, so X carries no information about Z. P(Y|do(X)) = Σ_z P(Y|X,Z=z)P(Z=z) (backdoor adjustment), while P(Y|X) = Σ_z P(Y|X,Z=z)P(Z=z|X). 
This distinction is why observational correlations can be misleading for decision-making: we need interventional quantities for causal predictions.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["do-calculus", "intervention", "pearl", "observation-vs-intervention"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "causal-infer-008", + "problem": "A difference-in-differences study estimates the effect of a minimum wage increase on employment by comparing employment changes in a state that raised its minimum wage to a neighboring state that didn't, before and after the policy change. What is the key identifying assumption, and how can it be tested?", + "domain": "causal", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "The key identifying assumption is the parallel trends assumption: in the absence of the treatment (minimum wage increase), the treatment and control groups would have followed the same trend in the outcome (employment). This means any level differences between states are constant over time — it's the trends, not the levels, that must match. Testing: (1) Visual inspection: plot outcome trends for both groups in the pre-treatment period. If they are approximately parallel, the assumption is plausible. (2) Formal test: regress outcome on group, time, group×time interactions using only pre-treatment data. Test if pre-treatment group×time coefficients are jointly zero. (3) Placebo tests: apply the DiD estimator at fake treatment dates in the pre-period; estimates should be near zero. (4) Add group-specific linear time trends as robustness check. 
Threats: differential shocks to local economies, anticipation effects, compositional changes, spillover effects between states.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["difference-in-differences", "parallel-trends", "quasi-experiment", "policy-evaluation"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/code/algorithms.json b/benchmarks/code/algorithms.json new file mode 100644 index 0000000..cc2af16 --- /dev/null +++ b/benchmarks/code/algorithms.json @@ -0,0 +1,152 @@ +[ + { + "task_id": "code-algo-001", + "problem": "Implement a function that finds the longest increasing subsequence (LIS) in an array of integers. Return the length of the LIS. Your solution must run in O(n log n) time.", + "domain": "code", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "Use patience sorting / binary search approach. Maintain a list 'tails' where tails[i] is the smallest tail element of all increasing subsequences of length i+1. For each element, binary search for the position to replace in tails (or append if larger than all). The length of tails is the LIS length. Example: [10,9,2,5,3,7,101,18] → LIS length = 4.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["dynamic-programming", "binary-search", "subsequence"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-002", + "problem": "Given a directed graph with possibly negative edge weights but no negative cycles, find the shortest path from a source vertex to all other vertices. The graph has V vertices and E edges. Describe the algorithm and its time complexity.", + "domain": "code", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "Use the Bellman-Ford algorithm. Initialize dist[source] = 0, dist[v] = ∞ for all others. 
Repeat V-1 times: for each edge (u,v,w), if dist[u] + w < dist[v], update dist[v] = dist[u] + w. Time complexity: O(V·E). An additional pass can detect negative cycles (if any distance is updated in the V-th iteration).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["graph", "shortest-path", "bellman-ford"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-003", + "problem": "You need to sort a list of 1 million 32-bit integers. A junior developer suggests using quicksort because it is always O(n log n). Is this reasoning correct? What would you recommend?", + "domain": "code", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "The reasoning is INCORRECT. Quicksort's worst case is O(n²), not O(n log n). The average case is O(n log n) with good pivot selection. For practical use, recommend: (1) Quicksort with randomized pivot or median-of-three to avoid worst case, (2) Introsort (quicksort that falls back to heapsort at recursion depth limit), which guarantees O(n log n), or (3) For 32-bit integers specifically, radix sort achieves O(n) time. Most standard library sorts use introsort or Timsort.", + "ground_truth_score": 0.0, + "is_trap": true, + "previously_unsolved": false, + "tags": ["trap", "sorting", "complexity-analysis", "common-misconception"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-004", + "problem": "Design a data structure that supports the following operations in O(1) amortized time: insert(val), remove(val), getRandom() — which returns a random element uniformly at random.", + "domain": "code", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "Use a combination of a dynamic array (list) and a hash map. The hash map maps values to their indices in the array. insert(val): append to array, store index in map. O(1). 
remove(val): swap the element with the last element in the array, update the map for the swapped element, then pop the last element and remove from map. O(1). getRandom(): generate random index in [0, len-1] and return array[index]. O(1).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["data-structure", "hash-map", "randomization", "design"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-005", + "problem": "Implement an algorithm to find all articulation points (cut vertices) in an undirected connected graph with V vertices and E edges. The algorithm should run in O(V + E) time.", + "domain": "code", + "difficulty": 7, + "timeout_seconds": 360, + "ground_truth": "Use Tarjan's algorithm with DFS. Maintain discovery time disc[u] and the lowest discovery time reachable low[u]. A vertex u is an articulation point if: (1) u is the root of the DFS tree and has 2+ children, OR (2) u is not root and has a child v where low[v] >= disc[u] (meaning no back edge from subtree of v reaches an ancestor of u). Time: O(V + E) since it's a single DFS pass.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["graph", "DFS", "tarjan", "articulation-points"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-006", + "problem": "Given an array of n integers, find a contiguous subarray whose sum is closest to zero. Your solution should be more efficient than O(n²).", + "domain": "code", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "Compute prefix sums P[0..n] where P[0]=0 and P[i] = A[0]+...+A[i-1]. The sum of subarray A[i..j-1] = P[j] - P[i]. To minimize |P[j] - P[i]|, sort the prefix sums (keeping original indices) and check adjacent pairs in sorted order. The minimum difference among adjacent sorted prefix sums gives the answer. 
Time: O(n log n) for sorting.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["prefix-sum", "sorting", "subarray"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-007", + "problem": "A developer claims that using a hash table guarantees O(1) lookup time for any input. They propose replacing all binary search tree operations in a codebase with hash table operations to improve performance universally. Evaluate this claim.", + "domain": "code", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "This claim is MISLEADING. Hash table O(1) is the average/amortized case with a good hash function. Worst case is O(n) due to collisions. BSTs provide O(log n) worst case (if balanced) and support ordered operations (range queries, in-order traversal, finding min/max, predecessor/successor) that hash tables cannot do efficiently. The replacement is inappropriate when: (1) ordered operations are needed, (2) worst-case guarantees matter, (3) adversarial inputs are possible (hash DoS), (4) memory locality matters (BSTs can be cache-friendlier for certain access patterns).", + "ground_truth_score": 0.0, + "is_trap": true, + "previously_unsolved": false, + "tags": ["trap", "hash-table", "BST", "complexity", "trade-offs"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-008", + "problem": "Implement a solution for the 0/1 knapsack problem: given n items with weights w[i] and values v[i], and a knapsack capacity W, find the maximum total value that can be carried. Items cannot be fractioned.", + "domain": "code", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "Use dynamic programming. Define dp[i][j] = max value using items 1..i with capacity j. dp[0][j] = 0. For each item i: dp[i][j] = max(dp[i-1][j], dp[i-1][j-w[i]] + v[i]) if j >= w[i], else dp[i-1][j]. 
Answer: dp[n][W]. Space optimization: use 1D array, iterate j from W down to w[i]. Time: O(nW), Space: O(W).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["dynamic-programming", "knapsack", "optimization"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-009", + "problem": "Given a string of length n, find the longest palindromic substring in O(n) time.", + "domain": "code", + "difficulty": 8, + "timeout_seconds": 420, + "ground_truth": "Use Manacher's algorithm. Transform string by inserting special characters (e.g., '#') between each character and at boundaries. Maintain a center C and right boundary R of the rightmost palindrome found so far. For each position i, use the mirror property: p[i] = min(R-i, p[2C-i]) if i < R, else p[i] = 0. Then try to expand around i. Update C, R when i + p[i] > R. The array p gives palindrome radii. Time: O(n) amortized since R only increases.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["string", "palindrome", "manacher", "linear-time"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-algo-010", + "problem": "Implement a concurrent lock-free queue that supports enqueue and dequeue operations from multiple threads without using mutexes. Describe the algorithm and discuss the ABA problem.", + "domain": "code", + "difficulty": 9, + "timeout_seconds": 480, + "ground_truth": "Use the Michael-Scott lock-free queue. Maintain Head and Tail pointers to sentinel/dummy nodes. Enqueue: create new node, CAS Tail->next from NULL to new node, then CAS Tail to new node. Dequeue: read Head->next, if non-null CAS Head to Head->next and return value. The ABA problem occurs when a CAS succeeds even though the value changed from A→B→A between read and CAS. 
Solve with tagged pointers (version counters) or hazard pointers for safe memory reclamation. Use atomic compare-and-swap (CAS) as the fundamental primitive.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["concurrency", "lock-free", "queue", "CAS", "ABA-problem"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/code/systems.json b/benchmarks/code/systems.json new file mode 100644 index 0000000..fc6fa8b --- /dev/null +++ b/benchmarks/code/systems.json @@ -0,0 +1,122 @@ +[ + { + "task_id": "code-systems-001", + "problem": "Design a URL shortening service (like bit.ly) that handles 100 million URLs per day. Describe the system architecture, database choice, hashing strategy, and how you would handle collisions.", + "domain": "code", + "difficulty": 5, + "timeout_seconds": 360, + "ground_truth": "Key components: (1) API servers behind a load balancer. (2) Use base62 encoding of an auto-incrementing counter or MD5/SHA256 hash truncated to 7 characters for short codes. (3) Database: NoSQL (DynamoDB/Cassandra) for high write throughput, keyed by short_code → {original_url, created_at, expiry}. (4) Cache layer (Redis/Memcached) for hot URLs (80/20 rule). (5) Collision handling: check-and-retry with hash, or use counter-based approach (no collisions). (6) Rate limiting per user/IP. (7) Analytics: async write to event queue. Scale: 100M/day ≈ 1157 writes/sec, easily handled by distributed DB.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "url-shortener", "distributed-systems", "hashing"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-systems-002", + "problem": "Design a distributed rate limiter that enforces API rate limits across multiple server instances. 
It must handle 50,000 requests per second and provide accurate limiting within a 1% error margin.", + "domain": "code", + "difficulty": 7, + "timeout_seconds": 420, + "ground_truth": "Approaches: (1) Token Bucket via Redis: use MULTI/EXEC with Lua script for atomicity. Key per user with token count and last refill timestamp. (2) Sliding Window Counter: combine fixed window counts with interpolation for the sliding effect—store counts in Redis with TTL. (3) For distributed consistency: use Redis Cluster with hash slots ensuring same user always hits same shard. (4) To handle Redis failures: local in-memory fallback with eventual sync. (5) For 50K RPS: Redis handles ~100K ops/sec per node, so 1-2 Redis nodes suffice. Use connection pooling. (6) Race conditions: Lua scripts in Redis are atomic. (7) Accuracy: sliding window log gives exact results but uses more memory; sliding window counter with interpolation gives <1% error.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "rate-limiting", "distributed-systems", "redis"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-systems-003", + "problem": "Design a real-time collaborative text editor (like Google Docs) that supports 100 concurrent users editing the same document simultaneously with sub-200ms latency.", + "domain": "code", + "difficulty": 9, + "timeout_seconds": 600, + "ground_truth": "Core algorithm: Use CRDTs (Conflict-free Replicated Data Types), specifically a sequence CRDT like RGA or LSEQ, OR use Operational Transformation (OT) with a centralized server. Architecture: (1) WebSocket connections for real-time bi-directional communication. (2) Each client maintains local document state and applies changes optimistically. (3) Operations sent to server which transforms/orders them and broadcasts to all clients. 
(4) For CRDT approach: each character has a unique ID (Lamport timestamp + site ID), insertions and deletions commute. (5) Cursor/selection sync via presence channel. (6) Persistence: periodic snapshots to database with operation log for recovery. (7) For 100 users: single server can handle this; for scale, partition by document. (8) Undo: maintain per-user operation stack with inverse operations.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "CRDT", "real-time", "collaborative-editing"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-systems-004", + "problem": "Design a distributed task queue (like Celery or AWS SQS) that guarantees at-least-once delivery, supports priority scheduling, dead-letter queues, and handles worker failures gracefully. Target throughput: 10,000 tasks/second.", + "domain": "code", + "difficulty": 7, + "timeout_seconds": 420, + "ground_truth": "Architecture: (1) Message broker: use a partitioned log (Kafka-like) or Redis Streams for ordering and persistence. (2) At-least-once delivery: workers ACK after completion; unACKed messages are re-enqueued after visibility timeout. (3) Priority: use multiple queues (high/medium/low) with weighted consumption, or a priority heap backed by sorted set in Redis. (4) Dead-letter queue: after N failed attempts, move to DLQ for manual inspection. Track retry count in message metadata. (5) Worker failure: heartbeat mechanism; coordinator reassigns tasks from dead workers. (6) Idempotency: consumers must handle duplicates (use task ID deduplication). (7) Persistence: WAL + periodic snapshots. (8) Scale: partition tasks by key, add consumers per partition. 
10K/sec is achievable with Kafka or Redis Streams on modest hardware.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "message-queue", "distributed-systems", "reliability"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-systems-005", + "problem": "Design a content delivery network (CDN) from scratch. Explain how you would handle cache invalidation, content routing, origin shielding, and TLS termination for serving static assets globally with p99 latency under 50ms.", + "domain": "code", + "difficulty": 8, + "timeout_seconds": 480, + "ground_truth": "Architecture: (1) Edge servers (PoPs) in 50+ global locations, each with SSD-backed cache. (2) DNS-based routing: GeoDNS directs users to nearest PoP based on IP geolocation; anycast for additional resilience. (3) Cache strategy: pull-based (lazy loading on cache miss), with TTL and Cache-Control headers. (4) Cache invalidation: purge API that fans out to all PoPs via internal pub/sub; use surrogate keys for group invalidation; stale-while-revalidate for availability. (5) Origin shielding: intermediate cache layer between edge and origin to reduce origin load; only shield nodes fetch from origin. (6) TLS termination at edge with session resumption (TLS 1.3, 0-RTT). Certificate management via automated Let's Encrypt or managed PKI. (7) Consistent hashing for distributing content across cache nodes within a PoP. (8) Health checks and automatic failover between PoPs.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "CDN", "caching", "networking", "global-scale"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-systems-006", + "problem": "Design a search autocomplete system that serves suggestions within 100ms for a platform with 500 million queries per day. 
Describe data structures, ranking, and how you handle new/trending queries.", + "domain": "code", + "difficulty": 6, + "timeout_seconds": 360, + "ground_truth": "Architecture: (1) Data structure: Trie with top-K suggestions cached at each node, rebuilt periodically. For production, use a compressed trie (Patricia trie) or precomputed prefix → suggestions mapping. (2) Ranking: frequency-weighted, with time decay (recent queries weighted higher). Combine historical frequency with real-time trending signal. (3) Serving: prefix lookup from in-memory trie, return top 10 suggestions. (4) Updates: batch rebuild trie from query logs every 15 min; for real-time trending, maintain a streaming top-K (Count-Min Sketch + heap) that feeds into a fast-update layer. (5) Infrastructure: shard by prefix (a-m on shard 1, n-z on shard 2, etc.). Replicate each shard. (6) Personalization: blend global suggestions with user-specific recent queries stored in user session cache. (7) Filtering: remove offensive/inappropriate suggestions via blocklist.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "autocomplete", "trie", "ranking", "real-time"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-systems-007", + "problem": "Design a distributed consensus system for a 5-node cluster that maintains strong consistency for a key-value store. Explain leader election, log replication, and how the system handles network partitions.", + "domain": "code", + "difficulty": 9, + "timeout_seconds": 540, + "ground_truth": "Implement Raft consensus protocol. (1) Leader election: nodes start as followers with randomized election timeouts (150-300ms). On timeout, become candidate, increment term, vote for self, request votes from others. Win with majority (3/5). (2) Log replication: leader appends entries to log, sends AppendEntries RPCs to followers. 
Entry committed when replicated on majority. (3) Safety: election restriction ensures candidate's log is at least as up-to-date as majority. (4) Network partitions: minority partition cannot elect leader (need 3/5 votes), so no split-brain. Old leader in minority discovers new term and steps down. (5) Client interaction: linearizability via leader reads with read index or lease-based reads. (6) Membership changes: joint consensus or single-server changes. (7) Log compaction: snapshotting state machine periodically, truncating log.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "consensus", "raft", "distributed-systems", "fault-tolerance"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "code-systems-008", + "problem": "Design a real-time fraud detection system for a payment processor handling 5,000 transactions per second. The system must flag suspicious transactions within 50ms while maintaining a false positive rate below 0.1%.", + "domain": "code", + "difficulty": 8, + "timeout_seconds": 480, + "ground_truth": "Architecture: (1) Streaming pipeline: Kafka for ingestion → Flink/Spark Streaming for real-time processing. (2) Feature computation: maintain per-user sliding windows of transaction amounts, frequencies, locations in Redis (e.g., last 1h, 24h, 7d aggregates). (3) Rule engine: fast deterministic rules (velocity checks, geographic impossibility, amount thresholds) as first filter. (4) ML model: gradient-boosted tree or neural network model for scoring, served via low-latency model server (TensorFlow Serving). Features: transaction amount vs user average, time since last transaction, merchant category, device fingerprint, IP risk score. (5) Decision: combine rule score + ML score. Flag if above threshold. (6) Feedback loop: labeled outcomes feed back into model retraining (weekly batch). 
(7) For 50ms latency: pre-compute features, keep model in memory, parallelize feature fetch and model inference. (8) False positive minimization: calibrate threshold on validation set; use the ROC curve (true positive rate vs. false positive rate) to find the 0.1% FPR operating point.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["system-design", "fraud-detection", "streaming", "ML-serving", "real-time"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/mathematics/algebra.json b/benchmarks/mathematics/algebra.json new file mode 100644 index 0000000..f16b18f --- /dev/null +++ b/benchmarks/mathematics/algebra.json @@ -0,0 +1,152 @@ +[ + { + "task_id": "math-algebra-001", + "problem": "Find all real solutions to the equation x^4 - 5x^2 + 4 = 0.", + "domain": "mathematics", + "difficulty": 2, + "timeout_seconds": 120, + "ground_truth": "x = -2, -1, 1, 2. Substitute u = x^2 to get u^2 - 5u + 4 = 0, which factors as (u-1)(u-4) = 0, giving u = 1 or u = 4, hence x = ±1 or x = ±2.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["polynomial", "substitution", "factoring"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-002", + "problem": "Solve the system of equations: x + y + z = 6, xy + yz + xz = 11, xyz = 6. Find all real solutions (x, y, z).", + "domain": "mathematics", + "difficulty": 4, + "timeout_seconds": 180, + "ground_truth": "The values x, y, z are roots of t^3 - 6t^2 + 11t - 6 = 0, which factors as (t-1)(t-2)(t-3) = 0. 
All permutations of (1, 2, 3) are solutions.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["symmetric-polynomials", "vieta", "systems"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-003", + "problem": "Determine whether the following statement is true or false: For all real numbers a and b, sqrt(a^2 + b^2) = a + b.", + "domain": "mathematics", + "difficulty": 1, + "timeout_seconds": 60, + "ground_truth": "False. This equality holds only when ab = 0 and both a, b >= 0. A simple counterexample: a = 1, b = 1 gives sqrt(2) ≠ 2. The correct identity is sqrt(a^2 + b^2) >= |a| with equality iff b = 0.", + "ground_truth_score": 0.0, + "is_trap": true, + "previously_unsolved": false, + "tags": ["trap", "common-misconception", "radicals"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-004", + "problem": "Let f(x) = x^3 + ax^2 + bx + c be a polynomial with integer coefficients. If f(1) = 0, f(-2) = 0, and f(3) = 20, find the values of a, b, and c.", + "domain": "mathematics", + "difficulty": 5, + "timeout_seconds": 240, + "ground_truth": "Since f(1) = 0 and f(-2) = 0, (x-1) and (x+2) are factors. So f(x) = (x-1)(x+2)(x-k) for some k. Expanding: f(x) = (x^2+x-2)(x-k). f(3) = (9+3-2)(3-k) = 10(3-k) = 20, so k = 1. Thus f(x) = (x-1)^2(x+2) = x^3 - 3x + 2, giving a = 0, b = -3, c = 2. Verification by expansion: (x-1)(x+2)(x-1) = (x-1)^2(x+2) = (x^2-2x+1)(x+2) = x^3 + 2x^2 - 2x^2 - 4x + x + 2 = x^3 - 3x + 2. 
So a = 0, b = -3, c = 2.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["polynomial", "factor-theorem", "systems"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-005", + "problem": "Find the sum of the infinite geometric series: 3 + 3/2 + 3/4 + 3/8 + ...", + "domain": "mathematics", + "difficulty": 2, + "timeout_seconds": 90, + "ground_truth": "This is a geometric series with first term a = 3 and common ratio r = 1/2. The sum is a/(1-r) = 3/(1-1/2) = 3/(1/2) = 6.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["series", "geometric", "convergence"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-006", + "problem": "Find all complex roots of z^4 + 4 = 0.", + "domain": "mathematics", + "difficulty": 6, + "timeout_seconds": 240, + "ground_truth": "z^4 = -4. Write -4 = 4·e^(iπ). The four roots are z_k = sqrt(2)·e^(i(π+2kπ)/4) for k = 0,1,2,3. These are z = 1+i, -1+i, -1-i, 1-i. Alternatively, z^4 + 4 = (z^2+2z+2)(z^2-2z+2) by Sophie Germain identity, giving roots via quadratic formula.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["complex-numbers", "roots-of-unity", "factoring"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-007", + "problem": "A student claims that for the equation x^2 + bx + c = 0, if the discriminant b^2 - 4c equals zero, then x = -b. Is this correct? If not, what is the correct solution?", + "domain": "mathematics", + "difficulty": 3, + "timeout_seconds": 120, + "ground_truth": "Incorrect. When the discriminant is zero, there is one repeated root x = -b/(2·1) = -b/2, not x = -b. The student forgot to divide by 2a. 
For example, x^2 + 4x + 4 = 0 has b^2 - 4c = 0 and root x = -2, not x = -4.", + "ground_truth_score": 0.0, + "is_trap": true, + "previously_unsolved": false, + "tags": ["trap", "quadratic-formula", "discriminant", "common-error"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-008", + "problem": "Let A be a 3×3 matrix with eigenvalues 1, 2, and 3. Compute det(A^2 + A - 6I), where I is the identity matrix.", + "domain": "mathematics", + "difficulty": 7, + "timeout_seconds": 300, + "ground_truth": "If λ is an eigenvalue of A, then λ^2 + λ - 6 is an eigenvalue of A^2 + A - 6I. For λ=1: 1+1-6 = -4. For λ=2: 4+2-6 = 0. For λ=3: 9+3-6 = 6. det(A^2 + A - 6I) = (-4)(0)(6) = 0.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["linear-algebra", "eigenvalues", "determinant", "matrix-polynomial"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-009", + "problem": "Prove or disprove: If p(x) is a polynomial of degree n with n distinct real roots, and q(x) = p(x) + 1, then q(x) has at most n-1 real roots.", + "domain": "mathematics", + "difficulty": 8, + "timeout_seconds": 360, + "ground_truth": "This is false in general. Not every choice of p gives a counterexample: for p(x) = x^2 - 1 (degree 2, roots ±1), q(x) = x^2 has a double root at 0 and hence only 1 distinct real root, so the claim holds in that case. However, for a cubic such as p(x) = x(x-3)(x-6), the local max and local min lie at distinct levels, so q(x) = p(x)+1 can still cross zero 3 times as long as the shift by 1 does not lift the local minimum above the x-axis. A concrete counterexample: take p(x) = x(x-10) = x^2-10x, which has degree n = 2, distinct real roots 0 and 10, and min value -25 at x=5. Then q(x) = x^2-10x+1 has discriminant 96 > 0, so 2 distinct real roots. n-1 = 1, but q has 2 real roots. 
Disproved.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["proof", "polynomial", "roots", "counterexample"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-algebra-010", + "problem": "Find the Galois group of the polynomial x^4 - 2 over the rationals Q.", + "domain": "mathematics", + "difficulty": 10, + "timeout_seconds": 600, + "ground_truth": "The roots are ±2^(1/4) and ±i·2^(1/4). The splitting field is Q(2^(1/4), i), which has degree 8 over Q. The Galois group is the dihedral group D_4 of order 8. It is generated by σ: 2^(1/4) ↦ i·2^(1/4), i ↦ i and τ: 2^(1/4) ↦ 2^(1/4), i ↦ -i, with relations σ^4 = τ^2 = e and τστ = σ^{-1}.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["galois-theory", "field-extensions", "group-theory", "advanced"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/mathematics/calculus.json b/benchmarks/mathematics/calculus.json new file mode 100644 index 0000000..b369a8d --- /dev/null +++ b/benchmarks/mathematics/calculus.json @@ -0,0 +1,152 @@ +[ + { + "task_id": "math-calculus-001", + "problem": "Compute the derivative of f(x) = x^3 · ln(x) for x > 0.", + "domain": "mathematics", + "difficulty": 1, + "timeout_seconds": 90, + "ground_truth": "Using the product rule: f'(x) = 3x^2·ln(x) + x^3·(1/x) = 3x^2·ln(x) + x^2 = x^2(3·ln(x) + 1).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["derivative", "product-rule", "logarithm"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-002", + "problem": "Evaluate the definite integral ∫₀¹ x·e^(x²) dx.", + "domain": "mathematics", + "difficulty": 3, + "timeout_seconds": 120, + "ground_truth": "Let u = x², du = 2x dx, so x dx = du/2. 
When x=0, u=0; when x=1, u=1. The integral becomes (1/2)∫₀¹ e^u du = (1/2)(e-1) = (e-1)/2 ≈ 0.8591.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["integration", "u-substitution", "definite-integral"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-003", + "problem": "Find the radius of convergence of the power series Σ (n=0 to ∞) n! · x^n / n^n.", + "domain": "mathematics", + "difficulty": 5, + "timeout_seconds": 240, + "ground_truth": "Apply the ratio test: |a_{n+1}/a_n| = |(n+1)!·x^{n+1}·n^n| / |n!·x^n·(n+1)^{n+1}| = |x|·(n/(n+1))^n = |x|·(1/(1+1/n))^n → |x|/e as n→∞. The series converges when |x|/e < 1, so the radius of convergence is R = e.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["power-series", "radius-of-convergence", "ratio-test"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-004", + "problem": "Find the volume of the solid obtained by revolving the region bounded by y = sin(x), y = 0, x = 0, and x = π about the x-axis.", + "domain": "mathematics", + "difficulty": 4, + "timeout_seconds": 180, + "ground_truth": "Using the disk method: V = π∫₀^π sin²(x) dx = π∫₀^π (1 - cos(2x))/2 dx = (π/2)[x - sin(2x)/2]₀^π = (π/2)(π - 0) = π²/2.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["volume-of-revolution", "disk-method", "trigonometric-integration"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-005", + "problem": "Determine whether the improper integral ∫₁^∞ sin(x)/x² dx converges or diverges. 
If it converges, find bounds on its value.", + "domain": "mathematics", + "difficulty": 6, + "timeout_seconds": 240, + "ground_truth": "Since |sin(x)/x²| ≤ 1/x² for all x ≥ 1, and ∫₁^∞ 1/x² dx = 1 converges, by the comparison test the integral converges absolutely. The value lies in (0, 1). More precisely, integration by parts gives ∫₁^∞ sin(x)/x² dx = [-cos(x)/x²]₁^∞ - ∫₁^∞ 2cos(x)/x³ dx. The numerical value is approximately 0.5041.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["improper-integral", "comparison-test", "convergence"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-006", + "problem": "Let f(x, y) = x²y + xy³. Compute all second-order partial derivatives and verify that f_xy = f_yx.", + "domain": "mathematics", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "f_x = 2xy + y³, f_y = x² + 3xy². f_xx = 2y, f_yy = 6xy, f_xy = 2x + 3y², f_yx = 2x + 3y². Indeed f_xy = f_yx = 2x + 3y², confirming Clairaut's theorem (both mixed partials are continuous).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["partial-derivatives", "multivariable", "clairaut"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-007", + "problem": "Use Stokes' theorem to evaluate ∮_C F · dr where F = (y², -x², z²) and C is the boundary of the triangle with vertices (1,0,0), (0,1,0), (0,0,1) oriented counterclockwise when viewed from above.", + "domain": "mathematics", + "difficulty": 8, + "timeout_seconds": 420, + "ground_truth": "By Stokes' theorem, ∮_C F·dr = ∬_S (curl F)·dS. curl F = (0-0, 0-0, -2x-2y) = (0, 0, -2x-2y). The surface S is the plane x+y+z=1 with normal n = (1,1,1)/√3. dS = (1,1,1) dA (upward normal). So the integral becomes ∬_D (0·1 + 0·1 + (-2x-2y)·1) dA = -2∬_D (x+y) dA over the region D: x≥0, y≥0, x+y≤1.
∬_D (x+y) dA = ∫₀¹∫₀^{1-x} (x+y) dy dx = ∫₀¹ [xy + y²/2]₀^{1-x} dx = ∫₀¹ (x(1-x) + (1-x)²/2) dx = ∫₀¹ (x - x² + 1/2 - x + x²/2) dx = ∫₀¹ (1/2 - x²/2) dx = [x/2 - x³/6]₀¹ = 1/2 - 1/6 = 1/3. Answer: -2/3.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["stokes-theorem", "vector-calculus", "surface-integral", "curl"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-008", + "problem": "Find the Taylor series of f(x) = 1/(1+x²) centered at x=0 and determine its interval of convergence.", + "domain": "mathematics", + "difficulty": 4, + "timeout_seconds": 180, + "ground_truth": "Using the geometric series 1/(1+u) = Σ(-1)^n u^n for |u|<1, substitute u = x²: 1/(1+x²) = Σ_{n=0}^∞ (-1)^n x^{2n} = 1 - x² + x⁴ - x⁶ + ... The series converges for |x²|<1, i.e., |x|<1. At x=±1, the series (-1)^n does not converge. Interval of convergence: (-1, 1).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["taylor-series", "geometric-series", "convergence"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-009", + "problem": "Solve the differential equation y'' + 4y' + 4y = e^(-2x) with initial conditions y(0) = 0, y'(0) = 1.", + "domain": "mathematics", + "difficulty": 7, + "timeout_seconds": 360, + "ground_truth": "Characteristic equation: r² + 4r + 4 = (r+2)² = 0, giving r = -2 (double root). Homogeneous solution: y_h = (C₁ + C₂x)e^(-2x). For particular solution, since e^(-2x) corresponds to the double root, try y_p = Ax²e^(-2x). y_p' = A(2x - 2x²)e^(-2x), y_p'' = A(2 - 8x + 4x²)e^(-2x). Substituting: A(2-8x+4x²)e^(-2x) + 4A(2x-2x²)e^(-2x) + 4Ax²e^(-2x) = e^(-2x). Simplifying: 2Ae^(-2x) = e^(-2x), so A = 1/2. General solution: y = (C₁ + C₂x)e^(-2x) + (x²/2)e^(-2x). Apply ICs: y(0) = C₁ = 0. y' = (C₂ - 2C₂x + x - x²)e^(-2x), y'(0) = C₂ = 1. 
Solution: y = (x + x²/2)e^(-2x).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["ODE", "second-order", "method-of-undetermined-coefficients"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-calculus-010", + "problem": "Evaluate the integral ∫₀^∞ e^(-x²) dx using the Gaussian integral technique.", + "domain": "mathematics", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "Let I = ∫₀^∞ e^(-x²) dx. Then I² = (∫₀^∞ e^(-x²) dx)(∫₀^∞ e^(-y²) dy) = ∫₀^∞∫₀^∞ e^(-(x²+y²)) dx dy. Convert to polar coordinates: I² = ∫₀^{π/2}∫₀^∞ e^(-r²) r dr dθ = (π/2)·[-e^(-r²)/2]₀^∞ = (π/2)·(1/2) = π/4. Therefore I = √π/2.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["gaussian-integral", "polar-coordinates", "multivariable"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/mathematics/number_theory.json b/benchmarks/mathematics/number_theory.json new file mode 100644 index 0000000..f2670fd --- /dev/null +++ b/benchmarks/mathematics/number_theory.json @@ -0,0 +1,122 @@ +[ + { + "task_id": "math-numtheory-001", + "problem": "Find the greatest common divisor of 252 and 198 using the Euclidean algorithm. Show each step.", + "domain": "mathematics", + "difficulty": 1, + "timeout_seconds": 90, + "ground_truth": "252 = 1·198 + 54; 198 = 3·54 + 36; 54 = 1·36 + 18; 36 = 2·18 + 0. 
Therefore gcd(252, 198) = 18.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["gcd", "euclidean-algorithm"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-numtheory-002", + "problem": "Prove that if p is a prime greater than 3, then p² - 1 is divisible by 24.", + "domain": "mathematics", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "Since p > 3 is prime, p is odd, so p = 2k+1 for some integer k. Then p² - 1 = (p-1)(p+1) = 2k(2k+2) = 4k(k+1). Since k(k+1) is the product of consecutive integers, one is even, so k(k+1) = 2m. Thus p²-1 = 8m, so 8 | (p²-1). Also, since p is not divisible by 3, p ≡ 1 or 2 (mod 3). If p ≡ 1, then p-1 ≡ 0 (mod 3). If p ≡ 2, then p+1 ≡ 0 (mod 3). Either way 3 | (p²-1). Since gcd(8,3) = 1, we have 24 | (p²-1).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["divisibility", "primes", "proof"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-numtheory-003", + "problem": "Find the last two digits of 7^2026.", + "domain": "mathematics", + "difficulty": 5, + "timeout_seconds": 240, + "ground_truth": "We need 7^2026 mod 100. By Euler's theorem, φ(100) = 40, so 7^40 ≡ 1 (mod 100). 2026 = 40·50 + 26, so 7^2026 ≡ 7^26 (mod 100). Compute: 7^1=7, 7^2=49, 7^4=2401≡1 (mod 100). So the order of 7 modulo 100 is 4, a divisor of 40, and 7^26 = 7^(4·6+2) = (7^4)^6 · 7^2 ≡ 1^6 · 49 = 49 (mod 100). The last two digits are 49.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["modular-arithmetic", "euler-theorem", "exponentiation"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-numtheory-004", + "problem": "How many positive divisors does 10!
have?", + "domain": "mathematics", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "10! = 3628800. Prime factorization: 10! = 2^8 · 3^4 · 5^2 · 7^1. The number of divisors is (8+1)(4+1)(2+1)(1+1) = 9·5·3·2 = 270.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["divisor-function", "prime-factorization", "factorial"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-numtheory-005", + "problem": "Find all integer solutions to the equation 3x + 7y = 1.", + "domain": "mathematics", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "Since gcd(3,7) = 1, solutions exist. A particular solution: x = -2, y = 1 (since 3(-2)+7(1) = -6+7 = 1). The general solution is x = -2 + 7t, y = 1 - 3t for all integers t.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["diophantine-equation", "bezout", "linear-equation"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-numtheory-006", + "problem": "Determine whether 2^67 - 1 is prime (this is the Mersenne number M_67).", + "domain": "mathematics", + "difficulty": 7, + "timeout_seconds": 360, + "ground_truth": "M_67 = 2^67 - 1 = 147573952589676412927 is NOT prime. It was famously claimed by Mersenne to be prime, but in 1903 Frank Nelson Cole showed it factors as 193707721 × 761838257287. 
This is a historically notable example of an incorrect primality claim.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["mersenne-primes", "factorization", "historical"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-numtheory-007", + "problem": "Prove that the sum of the reciprocals of the prime numbers diverges: Σ_{p prime} 1/p = ∞.", + "domain": "mathematics", + "difficulty": 9, + "timeout_seconds": 480, + "ground_truth": "Euler's proof: Suppose the sum converges. Consider the Euler product: Π_{p prime} 1/(1-1/p) = Σ_{n=1}^∞ 1/n (harmonic series). Taking logarithms: Σ_p -ln(1-1/p) = Σ_p (1/p + 1/(2p²) + ...) which lies between Σ 1/p and 2·Σ 1/p (since the higher terms converge). But ln(Π 1/(1-1/p)) = ln(Σ 1/n) → ∞. So Σ 1/p must diverge. Alternative: Erdős's combinatorial proof using the fact that if Σ 1/p converged, the count of integers up to N with all prime factors from a finite set would be o(N), contradicting that every integer has a prime factorization.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["primes", "divergence", "proof", "euler-product"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "math-numtheory-008", + "problem": "Use the Chinese Remainder Theorem to find the smallest positive integer x such that x ≡ 2 (mod 3), x ≡ 3 (mod 5), and x ≡ 2 (mod 7).", + "domain": "mathematics", + "difficulty": 5, + "timeout_seconds": 240, + "ground_truth": "By CRT, since 3, 5, 7 are pairwise coprime, a unique solution exists mod 105. From x ≡ 2 (mod 3): x = 3k + 2. Substitute into x ≡ 3 (mod 5): 3k + 2 ≡ 3 (mod 5), so 3k ≡ 1 (mod 5), k ≡ 2 (mod 5) (since 3·2 = 6 ≡ 1). So k = 5m + 2, x = 15m + 8. Substitute into x ≡ 2 (mod 7): 15m + 8 ≡ 2 (mod 7), 15m ≡ -6 (mod 7), m ≡ -6·(15)^{-1} (mod 7). 15 ≡ 1 (mod 7), so m ≡ -6 ≡ 1 (mod 7). 
m = 7j + 1, x = 105j + 23. The smallest positive x is 23.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["CRT", "modular-arithmetic", "congruences"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/scientific/chemistry.json b/benchmarks/scientific/chemistry.json new file mode 100644 index 0000000..d1280a7 --- /dev/null +++ b/benchmarks/scientific/chemistry.json @@ -0,0 +1,92 @@ +[ + { + "task_id": "sci-chem-001", + "problem": "Balance the following redox reaction in acidic solution: MnO₄⁻ + Fe²⁺ → Mn²⁺ + Fe³⁺", + "domain": "scientific", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "Half-reactions: Reduction: MnO₄⁻ + 8H⁺ + 5e⁻ → Mn²⁺ + 4H₂O. Oxidation: Fe²⁺ → Fe³⁺ + e⁻. Multiply oxidation by 5 to balance electrons: 5Fe²⁺ → 5Fe³⁺ + 5e⁻. Combined: MnO₄⁻ + 8H⁺ + 5Fe²⁺ → Mn²⁺ + 5Fe³⁺ + 4H₂O. Check: Mn: 1=1 ✓, O: 4=4 ✓, H: 8=8 ✓, Fe: 5=5 ✓, Charge: (-1+8+10)=17 left, (2+15)=17 right ✓.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["redox", "balancing", "half-reactions", "acidic-solution"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-chem-002", + "problem": "Calculate the pH of a 0.1 M acetic acid solution. The Ka of acetic acid is 1.8 × 10⁻⁵.", + "domain": "scientific", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "CH₃COOH ⇌ CH₃COO⁻ + H⁺. Ka = x²/(0.1 - x) ≈ x²/0.1 (since Ka << C, assume x << 0.1). x² = 1.8 × 10⁻⁶, x = 1.342 × 10⁻³ M. Check: x/0.1 = 1.34% < 5%, so approximation is valid. 
pH = -log(1.342 × 10⁻³) = 2.87.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["acid-base", "equilibrium", "pH", "weak-acid"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-chem-003", + "problem": "Explain why the boiling point of water (100°C) is anomalously high compared to H₂S (-60°C), H₂Se (-41°C), and H₂Te (-2°C), despite water having the lowest molecular weight in the group.", + "domain": "scientific", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "Water has an anomalously high boiling point due to extensive hydrogen bonding. Oxygen is highly electronegative (3.44) and small, creating strong O-H···O hydrogen bonds. Each water molecule can form up to 4 hydrogen bonds (2 as donor via its H atoms, 2 as acceptor via its lone pairs), creating a robust 3D hydrogen-bonded network. The other hydrides (H₂S, H₂Se, H₂Te) have much weaker intermolecular forces because S, Se, Te are larger and less electronegative, making hydrogen bonding negligible. Their boiling points follow the expected trend of increasing with molecular weight (London dispersion forces). Water is the anomaly, not the others.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["intermolecular-forces", "hydrogen-bonding", "boiling-point", "periodic-trends"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-chem-004", + "problem": "Using molecular orbital theory, explain why O₂ is paramagnetic while N₂ is diamagnetic. Draw the MO energy level diagrams for both molecules.", + "domain": "scientific", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "N₂ (14 electrons): MO filling order: σ1s² σ*1s² σ2s² σ*2s² π2p⁴ σ2p². All electrons are paired → diamagnetic. Bond order = (10-4)/2 = 3 (triple bond). 
O₂ (16 electrons): MO filling order: σ1s² σ*1s² σ2s² σ*2s² σ2p² π2p⁴ π*2p². The two π*2p electrons occupy separate degenerate orbitals with parallel spins (Hund's rule) → two unpaired electrons → paramagnetic. Bond order = (10-6)/2 = 2 (double bond). Key difference: for O₂, the higher nuclear charge causes σ2p to drop below π2p, and the extra electrons enter antibonding π* orbitals unpaired.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["molecular-orbital-theory", "paramagnetism", "electron-configuration"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-chem-005", + "problem": "A first-order reaction has a half-life of 20 minutes. If you start with 0.5 mol of reactant, how much remains after 1 hour? What is the rate constant?", + "domain": "scientific", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "For first-order reactions: t₁/₂ = ln(2)/k, so k = ln(2)/20 = 0.0347 min⁻¹. After 1 hour (60 min) = 3 half-lives: amount remaining = 0.5 × (1/2)³ = 0.5 × 1/8 = 0.0625 mol. Alternatively: [A] = [A]₀ × e^(-kt) = 0.5 × e^(-0.0347 × 60) = 0.5 × e^(-2.08) = 0.5 × 0.125 = 0.0625 mol.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["kinetics", "first-order", "half-life", "rate-constant"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-chem-006", + "problem": "Predict the major product of the following reaction: 2-bromobutane + NaOCH₃ in methanol at 60°C. Is the mechanism E1, E2, SN1, or SN2? Explain your reasoning considering the substrate, nucleophile/base, and conditions.", + "domain": "scientific", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "The major product is 2-butene (predominantly trans-2-butene) via E2 elimination. Reasoning: (1) Substrate: 2-bromobutane is a secondary alkyl halide. 
(2) Nucleophile/base: NaOCH₃ (sodium methoxide) is a strong, unhindered (non-bulky) base. (3) Strong bases with secondary substrates favor E2 over SN2 (steric hindrance disfavors SN2 at secondary carbon). (4) The elevated temperature (60°C) further favors elimination over substitution. (5) E2 mechanism: concerted, anti-periplanar transition state. (6) Zaitsev's rule: the more substituted alkene (2-butene) is the major product. Trans-2-butene is preferred due to less steric strain in the anti-periplanar transition state. Minor product: 1-butene.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["organic-chemistry", "elimination", "E2", "zaitsev", "mechanism"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/scientific/physics.json b/benchmarks/scientific/physics.json new file mode 100644 index 0000000..01a9490 --- /dev/null +++ b/benchmarks/scientific/physics.json @@ -0,0 +1,122 @@ +[ + { + "task_id": "sci-physics-001", + "problem": "A ball is thrown vertically upward with an initial velocity of 20 m/s from the top of a 45 m tall building. Ignoring air resistance, how long does it take for the ball to hit the ground? (Use g = 10 m/s².)", + "domain": "scientific", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "Taking upward as positive, with origin at the top of the building: y(t) = 20t - 5t². The ball hits the ground when y = -45. So 20t - 5t² = -45 → 5t² - 20t - 45 = 0 → t² - 4t - 9 = 0. Using the quadratic formula: t = (4 ± √(16+36))/2 = (4 ± √52)/2 = (4 ± 2√13)/2 = 2 ± √13.
Since t > 0: t = 2 + √13 ≈ 5.61 seconds.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["kinematics", "projectile-motion", "quadratic"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-physics-002", + "problem": "Two identical conducting spheres carry charges of +6 μC and -2 μC respectively. They are brought into contact and then separated. What is the charge on each sphere after separation? If they are then placed 0.3 m apart, what is the electrostatic force between them?", + "domain": "scientific", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "When identical conducting spheres touch, charge distributes equally. Total charge: 6 + (-2) = 4 μC. Each sphere gets 2 μC. Force: F = kq₁q₂/r² = (9×10⁹)(2×10⁻⁶)(2×10⁻⁶)/(0.3)² = (9×10⁹)(4×10⁻¹²)/0.09 = 36×10⁻³/0.09 = 0.4 N. The force is repulsive since both charges are positive.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["electrostatics", "coulomb-law", "charge-distribution"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-physics-003", + "problem": "A heat engine operates between a hot reservoir at 600 K and a cold reservoir at 300 K. A student claims the engine can achieve 70% efficiency. Is this possible?", + "domain": "scientific", + "difficulty": 2, + "timeout_seconds": 120, + "ground_truth": "No. The maximum possible efficiency is given by the Carnot efficiency: η_max = 1 - T_cold/T_hot = 1 - 300/600 = 0.5 = 50%. No real engine can exceed the Carnot efficiency, so 70% is impossible. 
This violates the second law of thermodynamics.", + "ground_truth_score": 0.0, + "is_trap": true, + "previously_unsolved": false, + "tags": ["trap", "thermodynamics", "carnot", "second-law"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-physics-004", + "problem": "Derive the expression for the electric field at a distance r from an infinite line charge with linear charge density λ using Gauss's law.", + "domain": "scientific", + "difficulty": 5, + "timeout_seconds": 240, + "ground_truth": "Choose a cylindrical Gaussian surface of radius r and length L, coaxial with the line charge. By symmetry, E is radial and constant on the curved surface. Flux through the ends is zero (E ⊥ dA). Gauss's law: ∮ E·dA = Q_enc/ε₀. E(2πrL) = λL/ε₀. Therefore E = λ/(2πε₀r), directed radially outward for positive λ.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["electromagnetism", "gauss-law", "symmetry", "derivation"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-physics-005", + "problem": "A spaceship travels at 0.8c relative to Earth. A clock on the spaceship measures a journey time of 10 years. How much time has elapsed on Earth? What distance did the spaceship travel as measured by an Earth observer?", + "domain": "scientific", + "difficulty": 5, + "timeout_seconds": 240, + "ground_truth": "Lorentz factor: γ = 1/√(1-v²/c²) = 1/√(1-0.64) = 1/√0.36 = 1/0.6 = 5/3. Time dilation: the proper time is the spaceship time (τ = 10 years). Earth time: t = γτ = (5/3)(10) = 50/3 ≈ 16.67 years. 
Distance as measured by Earth observer: d = v·t = 0.8c × (50/3) years = 40c/3 ≈ 13.33 light-years.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["special-relativity", "time-dilation", "lorentz-factor"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-physics-006", + "problem": "A quantum particle is in the state |ψ⟩ = (1/√3)|0⟩ + √(2/3)|1⟩. If you measure in the computational basis, what are the probabilities of each outcome? If you then measure the resulting state again in the same basis, what happens?", + "domain": "scientific", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "Probability of measuring |0⟩: |1/√3|² = 1/3. Probability of measuring |1⟩: |√(2/3)|² = 2/3. These sum to 1, confirming normalization. After measurement, the state collapses to the measured eigenstate. If you measured |0⟩, subsequent measurements always give |0⟩ with probability 1. If you measured |1⟩, subsequent measurements always give |1⟩ with probability 1. This is the projection postulate / wavefunction collapse.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["quantum-mechanics", "measurement", "probability", "collapse"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-physics-007", + "problem": "Derive the Schwarzschild radius for a non-rotating, uncharged black hole of mass M. Calculate the Schwarzschild radius for a black hole with the mass of the Sun (M_sun = 1.989 × 10^30 kg).", + "domain": "scientific", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "The Schwarzschild radius is obtained from the Schwarzschild metric by finding where the metric coefficient g_tt = 0 (or equivalently, the escape velocity equals c). From energy conservation (Newtonian approximation): (1/2)mc² = GMm/r → r_s = 2GM/c². 
For the Sun: r_s = 2(6.674×10⁻¹¹)(1.989×10³⁰)/(3×10⁸)² = 2(1.327×10²⁰)/(9×10¹⁶) = 2.954×10³ m ≈ 2.95 km. The exact GR derivation involves solving Einstein's field equations in vacuum with spherical symmetry, yielding the Schwarzschild metric ds² = -(1-r_s/r)c²dt² + (1-r_s/r)⁻¹dr² + r²dΩ², where r_s = 2GM/c².", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["general-relativity", "black-holes", "schwarzschild", "derivation"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "sci-physics-008", + "problem": "Using the path integral formulation, explain why in the classical limit (ℏ → 0), quantum mechanics reduces to classical mechanics. Specifically, explain the role of the stationary phase approximation.", + "domain": "scientific", + "difficulty": 9, + "timeout_seconds": 540, + "ground_truth": "In Feynman's path integral formulation, the quantum propagator is K = ∫ D[x(t)] exp(iS[x]/ℏ), where S[x] is the classical action. As ℏ → 0, the phase S/ℏ oscillates rapidly for paths where S varies. Contributions from neighboring paths with different actions cancel due to destructive interference. The only surviving contributions come from paths where S is stationary (δS = 0), which is precisely Hamilton's principle of classical mechanics. This is the stationary phase approximation: the integral is dominated by the classical path x_cl(t) satisfying the Euler-Lagrange equations. Quantum corrections appear as fluctuations around this classical path, suppressed by powers of ℏ. 
The leading correction gives the Van Vleck determinant: K ≈ √(det(-∂²S/∂x∂x')/(2πiℏ)) exp(iS_cl/ℏ).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["quantum-mechanics", "path-integral", "classical-limit", "stationary-phase"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/strategic/game_theory.json b/benchmarks/strategic/game_theory.json new file mode 100644 index 0000000..d02a31b --- /dev/null +++ b/benchmarks/strategic/game_theory.json @@ -0,0 +1,122 @@ +[ + { + "task_id": "strat-game-001", + "problem": "In the standard Prisoner's Dilemma with payoffs (years in prison): Both cooperate: 1 year each. Both defect: 3 years each. One defects while other cooperates: defector gets 0 years, cooperator gets 5 years. Find all Nash equilibria of this one-shot game.", + "domain": "strategic", + "difficulty": 2, + "timeout_seconds": 120, + "ground_truth": "The unique Nash equilibrium is (Defect, Defect) with payoff (3, 3). Proof: If Player 2 cooperates, Player 1 prefers defecting (0 < 1). If Player 2 defects, Player 1 still prefers defecting (3 < 5). Defect is a strictly dominant strategy for both players. Despite (Cooperate, Cooperate) being Pareto-superior, it is not a Nash equilibrium because each player has a unilateral incentive to deviate.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["prisoners-dilemma", "nash-equilibrium", "dominant-strategy"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-game-002", + "problem": "In a sealed-bid first-price auction with two bidders who each have independent private values uniformly distributed on [0, 1], find the Bayesian Nash Equilibrium bidding strategy.", + "domain": "strategic", + "difficulty": 7, + "timeout_seconds": 360, + "ground_truth": "The symmetric BNE bidding strategy is b(v) = v/2. 
Derivation: Assume bidder 2 uses strategy b(v₂) = αv₂. Bidder 1 with value v₁ chooses bid b to maximize expected payoff: (v₁ - b) · P(b > αv₂) = (v₁ - b) · P(v₂ < b/α) = (v₁ - b)(b/α). Maximize over b: d/db[(v₁ - b)(b/α)] = (v₁ - 2b)/α = 0 → b = v₁/2. By symmetry, α = 1/2. So b(v) = v/2. Each bidder shades their bid by half their value. Revenue = E[max(v₁/2, v₂/2)] = E[max(v₁,v₂)]/2 = (2/3)/2 = 1/3.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["auction-theory", "bayesian-nash", "first-price", "bid-shading"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-game-003", + "problem": "Consider a game where two players simultaneously choose an integer from 1 to 100. The player whose number is closest to 2/3 of the average of both numbers wins a prize. If tied, they split it. What is the Nash equilibrium?", + "domain": "strategic", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "The unique Nash equilibrium is both players choosing 1. Reasoning via iterated elimination of dominated strategies: (1) The maximum average is 100, so 2/3 of max average is 66.67. Any number > 67 is dominated. (2) Now max average is 67, so 2/3 × 67 ≈ 44.67, eliminating numbers > 45. (3) Continue: 2/3 × 45 = 30, then 20, then 14, then 9, then 6, then 4, then 3, then 2, then 1. The iteration converges to 1. This is the 'p-beauty contest' game, a classic demonstration of levels of reasoning. 
In practice, experimental results show most players choose numbers around 22-33 (corresponding to 1-2 levels of reasoning).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["beauty-contest", "iterated-dominance", "levels-of-reasoning"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-game-004", + "problem": "In the Ultimatum Game, Player 1 proposes a split of $100 and Player 2 can accept or reject (both get $0 if rejected). A student claims the Nash equilibrium is a 50-50 split because it is 'fair.' Evaluate this claim.", + "domain": "strategic", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "The claim is INCORRECT. In the subgame-perfect Nash equilibrium, Player 1 offers the minimum positive amount (e.g., $1 or $0.01) and Player 2 accepts, because any positive amount is better than $0. Offering $99/$1 (or even $100/$0 with Player 2 indifferent) are all Nash equilibria, but the subgame-perfect one has Player 1 offering the minimum acceptable amount. The 50-50 split IS a Nash equilibrium (Player 2's strategy: reject anything less than $50), but it is not subgame-perfect because Player 2's threat to reject $49 is not credible. Note: experimentally, offers below 30% are often rejected, showing human behavior deviates from game-theoretic predictions due to fairness preferences.", + "ground_truth_score": 0.0, + "is_trap": true, + "previously_unsolved": false, + "tags": ["trap", "ultimatum-game", "subgame-perfect", "fairness", "behavioral"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-game-005", + "problem": "Formulate the Tragedy of the Commons as a game-theoretic model. N farmers share a common pasture. Each farmer i chooses how many cattle g_i to graze (0 ≤ g_i ≤ G_max). Total grazing G = Σg_i. Each cow's value decreases with total grazing: v(G) = max(0, a - bG). 
Farmer i's payoff is g_i · v(G) - c · g_i where c is the cost per cow. Find the Nash equilibrium and compare it to the social optimum.", + "domain": "strategic", + "difficulty": 7, + "timeout_seconds": 360, + "ground_truth": "Nash equilibrium: Each farmer maximizes g_i(a - b·G) - c·g_i. FOC: a - b·G - b·g_i - c = 0. By symmetry g_i = g* for all i, so a - bNg* - bg* - c = 0 → g* = (a-c)/(b(N+1)). Total grazing: G_NE = N(a-c)/(b(N+1)). Social optimum: maximize Σg_i(a-bG) - cΣg_i = G(a-bG-c). FOC: a - 2bG - c = 0 → G_SO = (a-c)/(2b). Ratio: G_NE/G_SO = 2N/(N+1). As N → ∞, G_NE → 2·G_SO: the commons is overgrazed by a factor approaching 2. The tragedy arises because each farmer ignores the negative externality their cattle impose on others.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["tragedy-of-commons", "externality", "nash-equilibrium", "social-optimum"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-game-006", + "problem": "In a three-player simultaneous game, each player chooses Rock, Paper, or Scissors. If all three choose the same or all choose different, no one wins. If exactly two players match, the non-matching player wins or loses by standard RPS rules against the pair. Find a symmetric Nash equilibrium.", + "domain": "strategic", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "The symmetric Nash equilibrium is the uniform mixed strategy: each player plays Rock, Paper, and Scissors each with probability 1/3. Proof: By the symmetry of the game across both players and actions (any permutation of {R,P,S} combined with corresponding relabeling gives the same game), any symmetric NE must assign equal probability to each action. To verify: given opponents each play uniformly, any pure strategy gives the same expected payoff (by symmetry), so mixing uniformly is a best response. 
This is the unique symmetric NE.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["mixed-strategy", "three-player", "symmetric-game", "RPS"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-game-007", + "problem": "Consider a Stackelberg duopoly where Firm 1 (leader) chooses quantity q₁ first, then Firm 2 (follower) observes q₁ and chooses q₂. Market price is P = 100 - q₁ - q₂. Both firms have marginal cost c = 10. Find the subgame-perfect equilibrium quantities, price, and profits.", + "domain": "strategic", + "difficulty": 6, + "timeout_seconds": 300, + "ground_truth": "Solve by backward induction. Firm 2's best response: max_q₂ (100 - q₁ - q₂ - 10)q₂. FOC: 90 - q₁ - 2q₂ = 0 → q₂* = (90 - q₁)/2. Firm 1 anticipates this: max_q₁ (100 - q₁ - (90-q₁)/2 - 10)q₁ = (100 - q₁ - 45 + q₁/2 - 10)q₁ = (45 - q₁/2)q₁. FOC: 45 - q₁ = 0 → q₁* = 45. Then q₂* = (90-45)/2 = 22.5. Price: P = 100 - 45 - 22.5 = 32.5. Profits: π₁ = (32.5-10)(45) = 1012.5, π₂ = (32.5-10)(22.5) = 506.25. The leader earns twice the follower's profit — the first-mover advantage.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["stackelberg", "duopoly", "backward-induction", "first-mover"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-game-008", + "problem": "Explain the Gibbard-Satterthwaite theorem and its implications for voting system design. 
Can any voting system with 3+ candidates be strategy-proof?", + "domain": "strategic", + "difficulty": 8, + "timeout_seconds": 420, + "ground_truth": "The Gibbard-Satterthwaite theorem states that for elections with 3 or more candidates, any voting rule that is: (1) surjective (every candidate can win under some preference profile), and (2) strategy-proof (no voter can benefit by misrepresenting their preferences) must be dictatorial (determined by a single voter's preferences). Implications: No non-dictatorial, surjective voting system with 3+ candidates is immune to strategic voting. This is closely related to Arrow's impossibility theorem. Practical consequences: all real voting systems (plurality, Borda count, ranked choice, Condorcet methods) are manipulable. System designers must accept some degree of manipulability and choose systems where manipulation is computationally hard or requires precise information about others' preferences. Exceptions only arise in restricted domains (e.g., single-peaked preferences allow strategy-proof rules like median voter mechanisms).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["social-choice", "impossibility-theorem", "strategy-proof", "voting"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/benchmarks/strategic/optimization.json b/benchmarks/strategic/optimization.json new file mode 100644 index 0000000..fe4391f --- /dev/null +++ b/benchmarks/strategic/optimization.json @@ -0,0 +1,92 @@ +[ + { + "task_id": "strat-optim-001", + "problem": "A factory produces two products A and B. Product A requires 2 hours of machining and 1 hour of assembly, yielding $40 profit. Product B requires 1 hour of machining and 3 hours of assembly, yielding $60 profit. Available: 100 hours of machining and 120 hours of assembly per week. 
Formulate and solve this linear programming problem.", + "domain": "strategic", + "difficulty": 3, + "timeout_seconds": 180, + "ground_truth": "Maximize 40x_A + 60x_B subject to: 2x_A + x_B ≤ 100 (machining), x_A + 3x_B ≤ 120 (assembly), x_A, x_B ≥ 0. Corner points: (0,0): profit = 0. (50,0): profit = 2000. (0,40): profit = 2400. Intersection of constraints: 2x_A + x_B = 100 and x_A + 3x_B = 120. From first: x_B = 100-2x_A. Substitute: x_A + 3(100-2x_A) = 120 → x_A + 300-6x_A = 120 → -5x_A = -180 → x_A = 36, x_B = 28. Profit = 40(36) + 60(28) = 1440 + 1680 = 3120. Maximum profit: $3,120 at (x_A, x_B) = (36, 28).", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["linear-programming", "simplex", "manufacturing"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-optim-002", + "problem": "A company must decide how to allocate a $1 million marketing budget across 4 channels: TV, Social Media, Search Ads, and Email. Historical data suggests diminishing returns modeled as: Revenue_i = a_i · sqrt(budget_i). Given a_TV = 500, a_Social = 400, a_Search = 600, a_Email = 300, find the optimal allocation to maximize total revenue.", + "domain": "strategic", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "Maximize Σ a_i·√(x_i) subject to Σ x_i = 1,000,000 and x_i ≥ 0. Using Lagrange multipliers: d/dx_i [a_i·√(x_i)] = λ for all i. a_i/(2√(x_i)) = λ → √(x_i) = a_i/(2λ) → x_i = a_i²/(4λ²). Budget constraint: Σ a_i²/(4λ²) = 1,000,000. Σ a_i² = 500² + 400² + 600² + 300² = 250,000 + 160,000 + 360,000 + 90,000 = 860,000. 4λ² = 860,000/1,000,000 = 0.86. x_i = a_i²/0.86. x_TV = 250,000/0.86 = $290,698. x_Social = 160,000/0.86 = $186,047. x_Search = 360,000/0.86 = $418,605. x_Email = 90,000/0.86 = $104,651. Total ≈ $1,000,001 (rounding). 
Allocate proportional to the square of the effectiveness coefficient.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["optimization", "lagrange-multipliers", "budget-allocation", "diminishing-returns"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-optim-003", + "problem": "Solve the Traveling Salesman Problem for 5 cities with the following distance matrix (symmetric):\n A-B: 10, A-C: 15, A-D: 20, A-E: 25\n B-C: 35, B-D: 25, B-E: 30\n C-D: 30, C-E: 20\n D-E: 15\nFind the shortest Hamiltonian cycle.", + "domain": "strategic", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "With 5 cities, there are (5-1)!/2 = 12 distinct tours. Enumerate key candidates: A→B→D→E→C→A: 10+25+15+20+15 = 85. A→B→C→E→D→A: 10+35+20+15+20 = 100. A→C→E→D→B→A: 15+20+15+25+10 = 85. A→B→D→E→C→A: 85 (same as first). A→C→B→D→E→A: 15+35+25+15+25 = 115. A→B→E→C→D→A: 10+30+20+30+20 = 110. A→D→E→C→B→A: 20+15+20+35+10 = 100. A→B→D→C→E→A: 10+25+30+20+25 = 110. The optimal tour is A→B→D→E→C→A (or its reverse) with total distance 85.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["TSP", "combinatorial-optimization", "enumeration"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-optim-004", + "problem": "A warehouse must fulfill orders from 3 customers using 2 supply sources. Supply: S1 has 300 units, S2 has 200 units. Demand: D1 needs 150, D2 needs 200, D3 needs 150. Shipping costs per unit: S1→D1: $4, S1→D2: $8, S1→D3: $1, S2→D1: $7, S2→D2: $2, S2→D3: $5. Find the minimum cost transportation plan.", + "domain": "strategic", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "This is a balanced transportation problem (total supply = 500, total demand = 500). 
Using Vogel's Approximation Method or solving the LP: Optimal allocation: S1→D1: 150, S1→D2: 0, S1→D3: 150 (using 300 from S1). S2→D1: 0, S2→D2: 200, S2→D3: 0 (using 200 from S2). Cost = 150(4) + 150(1) + 200(2) = 600 + 150 + 400 = $1,150. Verify: All supply used (S1: 150+150=300, S2: 200). All demand met (D1: 150, D2: 200, D3: 150). This is optimal because every customer is served entirely by its cheapest source (D1 and D3 from S1, D2 from S2) and both supply limits are met exactly.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["transportation-problem", "linear-programming", "logistics"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-optim-005", + "problem": "Describe the simulated annealing algorithm for combinatorial optimization. What is the role of the temperature parameter, and how does the cooling schedule affect solution quality? Compare to hill climbing.", + "domain": "strategic", + "difficulty": 5, + "timeout_seconds": 300, + "ground_truth": "Simulated Annealing (SA): (1) Start with initial solution S, temperature T. (2) Generate neighbor S'. (3) If f(S') < f(S) (better), accept. If worse, accept with probability exp(-(f(S')-f(S))/T). (4) Decrease T according to cooling schedule. (5) Repeat until frozen. Temperature role: at high T, accepts worse solutions freely (exploration). As T→0, only accepts improvements (exploitation). This allows escaping local optima, unlike hill climbing which always gets stuck. Cooling schedule: geometric (T' = αT, α ≈ 0.95), linear, logarithmic. Slower cooling → better solutions but more computation. Theoretically, logarithmic cooling (T_k = C/ln(k)) guarantees convergence to global optimum, but is impractically slow. Hill climbing is a special case of SA with T = 0: greedy, fast, but trapped by local optima. 
SA provides probabilistic guarantee of finding near-optimal solutions for NP-hard problems.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["simulated-annealing", "metaheuristic", "combinatorial-optimization", "comparison"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + }, + { + "task_id": "strat-optim-006", + "problem": "Find the minimum of f(x, y) = (x-1)² + (y-2)² + (x-y)² using gradient descent. Derive the gradient analytically and describe the convergence properties.", + "domain": "strategic", + "difficulty": 4, + "timeout_seconds": 240, + "ground_truth": "Gradient: ∂f/∂x = 2(x-1) + 2(x-y) = 4x - 2y - 2. ∂f/∂y = 2(y-2) + 2(x-y)(-1) = 2y - 4 - 2x + 2y = -2x + 4y - 4. Setting gradient to zero: 4x - 2y = 2 and -2x + 4y = 4. From first equation: 2x - y = 1 → y = 2x - 1. Substitute: -2x + 4(2x-1) = 4 → -2x + 8x - 4 = 4 → 6x = 8 → x = 4/3. y = 2(4/3) - 1 = 5/3. Minimum at (4/3, 5/3). f(4/3, 5/3) = (1/3)² + (-1/3)² + (-1/3)² = 1/9 + 1/9 + 1/9 = 1/3. The Hessian is [[4,-2],[-2,4]] with eigenvalues 2 and 6, both positive → confirmed minimum. Optimal step size for gradient descent: α = 2/(λ_max + λ_min) = 2/8 = 1/4. 
Convergence rate: (λ_max-λ_min)/(λ_max+λ_min) = 4/8 = 1/2 per iteration.", + "ground_truth_score": 1.0, + "is_trap": false, + "previously_unsolved": false, + "tags": ["gradient-descent", "convex-optimization", "convergence-rate", "analytical"], + "source": "benchmark", + "author": "ReasonForge Team", + "created_at": "2026-01-15" + } +] diff --git a/docker/Dockerfile.gateway b/docker/Dockerfile.gateway new file mode 100644 index 0000000..4ba9d26 --- /dev/null +++ b/docker/Dockerfile.gateway @@ -0,0 +1,26 @@ +FROM python:3.12-slim + +LABEL maintainer="ReasonForge Team" +LABEL description="ReasonForge API Gateway" + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends gcc && \ + rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY reasonforge/ ./reasonforge/ +COPY api/ ./api/ + +EXPOSE 8000 + +CMD ["uvicorn", "api.server:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker/Dockerfile.miner b/docker/Dockerfile.miner new file mode 100644 index 0000000..64b1b48 --- /dev/null +++ b/docker/Dockerfile.miner @@ -0,0 +1,26 @@ +FROM python:3.12-slim + +LABEL maintainer="ReasonForge Team" +LABEL description="ReasonForge Miner Node" + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends gcc g++ && \ + rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt requirements-miner.txt ./ +RUN pip install --no-cache-dir -r requirements.txt -r requirements-miner.txt + +# Copy application code +COPY reasonforge/ ./reasonforge/ +COPY neurons/miner.py ./neurons/miner.py + +EXPOSE 9091 + +ENTRYPOINT ["python", "neurons/miner.py"] diff --git a/docker/Dockerfile.sandbox b/docker/Dockerfile.sandbox new 
file mode 100644 index 0000000..84b2367 --- /dev/null +++ b/docker/Dockerfile.sandbox @@ -0,0 +1,23 @@ +FROM python:3.12-slim + +LABEL maintainer="ReasonForge Team" +LABEL description="ReasonForge Sandbox - Isolated execution environment" + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 + +# Install only allowed scientific packages +RUN pip install --no-cache-dir numpy scipy sympy + +# Create non-root sandbox user +RUN useradd --create-home --shell /bin/bash sandbox + +# Restrict writable directories to /tmp only +RUN chmod 755 /home/sandbox && \ + chown sandbox:sandbox /tmp + +USER sandbox +WORKDIR /tmp + +# No network access should be configured via docker-compose/network policy +ENTRYPOINT ["python"] diff --git a/docker/Dockerfile.validator b/docker/Dockerfile.validator new file mode 100644 index 0000000..08a098a --- /dev/null +++ b/docker/Dockerfile.validator @@ -0,0 +1,35 @@ +FROM python:3.12-slim + +LABEL maintainer="ReasonForge Team" +LABEL description="ReasonForge Validator Node" + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends gcc g++ curl git && \ + rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt requirements-validator.txt ./ +RUN pip install --no-cache-dir -r requirements.txt -r requirements-validator.txt + +# Optional: Install Lean4 for formal verification +ARG INSTALL_LEAN=false +RUN if [ "$INSTALL_LEAN" = "true" ]; then \ + curl -sSf https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh | bash -s -- -y --default-toolchain leanprover/lean4:stable && \ + echo 'export PATH="$HOME/.elan/bin:$PATH"' >> /etc/profile.d/lean.sh; \ + fi +ENV PATH="/root/.elan/bin:${PATH}" + +# Copy application code +COPY reasonforge/ ./reasonforge/ +COPY neurons/validator.py ./neurons/validator.py +COPY benchmarks/ ./benchmarks/ + +EXPOSE 9092 8092 + +ENTRYPOINT ["python", 
"neurons/validator.py"] diff --git a/docker/docker-compose.localnet.yml b/docker/docker-compose.localnet.yml new file mode 100644 index 0000000..30b0c18 --- /dev/null +++ b/docker/docker-compose.localnet.yml @@ -0,0 +1,81 @@ +version: "3.8" + +services: + validator: + build: + context: .. + dockerfile: docker/Dockerfile.validator + container_name: reasonforge-validator-local + ports: + - "9092:9092" + - "8092:8092" + environment: + - NETUID=1 + - SUBTENSOR_NETWORK=local + - SUBTENSOR_CHAIN_ENDPOINT=ws://subtensor:9944 + - WALLET_NAME=owner + - WALLET_HOTKEY=validator + - LOGGING_LEVEL=DEBUG + volumes: + - wallet-data:/root/.bittensor + networks: + - localnet + + miner: + build: + context: .. + dockerfile: docker/Dockerfile.miner + container_name: reasonforge-miner-local + ports: + - "9091:9091" + environment: + - NETUID=1 + - SUBTENSOR_NETWORK=local + - SUBTENSOR_CHAIN_ENDPOINT=ws://subtensor:9944 + - WALLET_NAME=owner + - WALLET_HOTKEY=miner + - LOGGING_LEVEL=DEBUG + volumes: + - wallet-data:/root/.bittensor + networks: + - localnet + + gateway: + build: + context: .. + dockerfile: docker/Dockerfile.gateway + container_name: reasonforge-gateway-local + ports: + - "8000:8000" + environment: + - NETUID=1 + - SUBTENSOR_NETWORK=local + - SUBTENSOR_CHAIN_ENDPOINT=ws://subtensor:9944 + - LOGGING_LEVEL=DEBUG + networks: + - localnet + depends_on: + - validator + + sandbox: + build: + context: .. 
+ dockerfile: docker/Dockerfile.sandbox + container_name: reasonforge-sandbox-local + read_only: true + tmpfs: + - /tmp:size=100M + security_opt: + - no-new-privileges:true + networks: + - sandbox-net + +volumes: + wallet-data: + +networks: + localnet: + driver: bridge + sandbox-net: + driver: bridge + internal: true diff --git a/docker/docker-compose.monitoring.yml b/docker/docker-compose.monitoring.yml new file mode 100644 index 0000000..ad172e2 --- /dev/null +++ b/docker/docker-compose.monitoring.yml @@ -0,0 +1,43 @@ +version: "3.8" + +services: + prometheus: + image: prom/prometheus:v2.51.0 + container_name: reasonforge-prometheus + restart: unless-stopped + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + volumes: + - ../monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ../monitoring/alerts:/etc/prometheus/alerts:ro + - prometheus-data:/prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=30d" + networks: + - monitoring + + grafana: + image: grafana/grafana:10.4.0 + container_name: reasonforge-grafana + restart: unless-stopped + ports: + - "${GRAFANA_PORT:-3000}:3000" + environment: + - GF_SECURITY_ADMIN_USER=${GRAFANA_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin} + volumes: + - ../monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + networks: + - monitoring + depends_on: + - prometheus + +volumes: + prometheus-data: + grafana-data: + +networks: + monitoring: + driver: bridge diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..abe5396 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,133 @@ +version: "3.8" + +services: + validator: + build: + context: .. 
+ dockerfile: docker/Dockerfile.validator + args: + INSTALL_LEAN: "${INSTALL_LEAN:-false}" + container_name: reasonforge-validator + restart: unless-stopped + ports: + - "${VALIDATOR_PORT:-9092}:9092" + - "${VALIDATOR_METRICS_PORT:-8092}:8092" + environment: + - NETUID=${NETUID:-1} + - SUBTENSOR_NETWORK=${SUBTENSOR_NETWORK:-finney} + - SUBTENSOR_CHAIN_ENDPOINT=${SUBTENSOR_CHAIN_ENDPOINT:-} + - WALLET_NAME=${WALLET_NAME:-default} + - WALLET_HOTKEY=${WALLET_HOTKEY:-default} + - LOGGING_LEVEL=${LOGGING_LEVEL:-INFO} + volumes: + - wallet-data:/root/.bittensor + - validator-data:/app/data + networks: + - reasonforge-net + depends_on: + - sandbox + + miner: + build: + context: .. + dockerfile: docker/Dockerfile.miner + container_name: reasonforge-miner + restart: unless-stopped + ports: + - "${MINER_PORT:-9091}:9091" + environment: + - NETUID=${NETUID:-1} + - SUBTENSOR_NETWORK=${SUBTENSOR_NETWORK:-finney} + - SUBTENSOR_CHAIN_ENDPOINT=${SUBTENSOR_CHAIN_ENDPOINT:-} + - WALLET_NAME=${WALLET_NAME:-default} + - WALLET_HOTKEY=${WALLET_HOTKEY:-default} + - LOGGING_LEVEL=${LOGGING_LEVEL:-INFO} + volumes: + - wallet-data:/root/.bittensor + - miner-data:/app/data + networks: + - reasonforge-net + + gateway: + build: + context: .. + dockerfile: docker/Dockerfile.gateway + container_name: reasonforge-gateway + restart: unless-stopped + ports: + - "${GATEWAY_PORT:-8000}:8000" + environment: + - NETUID=${NETUID:-1} + - SUBTENSOR_NETWORK=${SUBTENSOR_NETWORK:-finney} + - SUBTENSOR_CHAIN_ENDPOINT=${SUBTENSOR_CHAIN_ENDPOINT:-} + - LOGGING_LEVEL=${LOGGING_LEVEL:-INFO} + networks: + - reasonforge-net + depends_on: + - validator + + sandbox: + build: + context: .. 
+ dockerfile: docker/Dockerfile.sandbox + container_name: reasonforge-sandbox + restart: unless-stopped + read_only: true + tmpfs: + - /tmp:size=100M + security_opt: + - no-new-privileges:true + deploy: + resources: + limits: + memory: 512M + cpus: "1.0" + networks: + - sandbox-net + + prometheus: + image: prom/prometheus:v2.51.0 + container_name: reasonforge-prometheus + restart: unless-stopped + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + volumes: + - ../monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ../monitoring/alerts:/etc/prometheus/alerts:ro + - prometheus-data:/prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=30d" + networks: + - reasonforge-net + + grafana: + image: grafana/grafana:10.4.0 + container_name: reasonforge-grafana + restart: unless-stopped + ports: + - "${GRAFANA_PORT:-3000}:3000" + environment: + - GF_SECURITY_ADMIN_USER=${GRAFANA_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin} + volumes: + - ../monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + networks: + - reasonforge-net + depends_on: + - prometheus + +volumes: + wallet-data: + validator-data: + miner-data: + prometheus-data: + grafana-data: + +networks: + reasonforge-net: + driver: bridge + sandbox-net: + driver: bridge + internal: true diff --git a/min_compute.yml b/min_compute.yml new file mode 100644 index 0000000..cde45e0 --- /dev/null +++ b/min_compute.yml @@ -0,0 +1,27 @@ +# ReasonForge - Minimum Compute Requirements +min_compute: + miner: + cpu: 4 + ram_gb: 16 + storage_gb: 50 + gpu: optional # Depends on LLM backend + bandwidth_mbps: 100 + validator: + cpu: 8 + ram_gb: 32 + storage_gb: 100 + gpu: recommended # For embedding model + bandwidth_mbps: 200 + +# Subnet hyperparameters +subnet: + tempo: 360 # Blocks per epoch (~72 minutes) + immunity_period: 7200 # New neuron protection (~24 hours) + max_miners: 192 + max_validators: 64 + 
min_validator_stake: 1000 # TAO + weights_rate_limit: 100 # Blocks between weight updates + weights_version_key: 1 + adjustment_alpha: 0.7 + difficulty: 10000000 # POW registration difficulty + registration_cost: 0.1 # TAO burn registration diff --git a/monitoring/alerts/rules.yml b/monitoring/alerts/rules.yml new file mode 100644 index 0000000..3866011 --- /dev/null +++ b/monitoring/alerts/rules.yml @@ -0,0 +1,74 @@ +groups: + - name: reasonforge_alerts + rules: + - alert: ValidatorDown + expr: up{job="reasonforge-validator"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Validator node is down" + description: "The ReasonForge validator has been unreachable for more than 2 minutes." + + - alert: MinerDown + expr: up{job="reasonforge-miner"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Miner node is down" + description: "The ReasonForge miner has been unreachable for more than 2 minutes." + + - alert: GatewayDown + expr: up{job="reasonforge-gateway"} == 0 + for: 2m + labels: + severity: warning + annotations: + summary: "API Gateway is down" + description: "The ReasonForge API gateway has been unreachable for more than 2 minutes." + + - alert: HighErrorRate + expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "More than 5% of requests are returning 5xx errors over the last 5 minutes." + + - alert: HighLatency + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "High request latency" + description: "95th percentile request latency is above 10 seconds for the last 5 minutes." 
+ + - alert: LowMinerResponseRate + expr: rate(miner_responses_total[10m]) == 0 + for: 10m + labels: + severity: warning + annotations: + summary: "No miner responses" + description: "The miner has not produced any responses in the last 10 minutes." + + - alert: ValidatorScoreDropped + expr: delta(validator_average_score[30m]) < -0.2 + for: 5m + labels: + severity: warning + annotations: + summary: "Validator average score dropped significantly" + description: "The average validator score has dropped by more than 0.2 in the last 30 minutes." + + - alert: HighMemoryUsage + expr: process_resident_memory_bytes > 4e9 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage" + description: "Process memory usage exceeds 4GB." diff --git a/monitoring/grafana/dashboards/miner_performance.json b/monitoring/grafana/dashboards/miner_performance.json new file mode 100644 index 0000000..5c0bb43 --- /dev/null +++ b/monitoring/grafana/dashboards/miner_performance.json @@ -0,0 +1,233 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": 0 }, + { "color": "yellow", "value": 0.3 }, + { "color": "green", "value": 0.7 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Current Miner Score", + "type": "stat", + "targets": [ + { + "expr": "miner_score", + "legendFormat": "UID {{uid}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 }, + "id": 2, + "options": { + 
"colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Total Responses", + "type": "stat", + "targets": [ + { + "expr": "miner_responses_total", + "legendFormat": "Responses" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Avg Response Time", + "type": "stat", + "targets": [ + { + "expr": "rate(miner_response_duration_seconds_sum[5m]) / rate(miner_response_duration_seconds_count[5m])", + "legendFormat": "Avg Latency" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "id": 4, + "title": "Miner Score Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "miner_score", + "legendFormat": "UID {{uid}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "id": 5, + "title": "Response Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(miner_responses_total[5m])", + "legendFormat": "UID {{uid}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "id": 6, + "title": "Response Duration (p50 / p95 / p99)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(miner_response_duration_seconds_bucket[5m]))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, rate(miner_response_duration_seconds_bucket[5m]))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, rate(miner_response_duration_seconds_bucket[5m]))", + "legendFormat": "p99" + } + ] 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "id": 7, + "title": "Verification Results", + "type": "timeseries", + "targets": [ + { + "expr": "rate(miner_verification_passed_total[5m])", + "legendFormat": "Passed" + }, + { + "expr": "rate(miner_verification_failed_total[5m])", + "legendFormat": "Failed" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "id": 8, + "title": "Miner Memory Usage", + "type": "timeseries", + "targets": [ + { + "expr": "process_resident_memory_bytes{job=\"reasonforge-miner\"}", + "legendFormat": "RSS" + }, + { + "expr": "process_virtual_memory_bytes{job=\"reasonforge-miner\"}", + "legendFormat": "Virtual" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "id": 9, + "title": "Miner CPU Usage", + "type": "timeseries", + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{job=\"reasonforge-miner\"}[5m])", + "legendFormat": "CPU Usage" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["reasonforge", "miner", "performance"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "ReasonForge - Miner Performance", + "uid": "reasonforge-miner-performance", + "version": 1 +} diff --git a/monitoring/grafana/dashboards/subnet_overview.json b/monitoring/grafana/dashboards/subnet_overview.json new file mode 100644 index 0000000..2669884 --- /dev/null +++ b/monitoring/grafana/dashboards/subnet_overview.json @@ -0,0 +1,202 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { 
+ "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": 0 }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Validator Status", + "type": "stat", + "targets": [ + { + "expr": "up{job=\"reasonforge-validator\"}", + "legendFormat": "Validator" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": 0 }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Miner Status", + "type": "stat", + "targets": [ + { + "expr": "up{job=\"reasonforge-miner\"}", + "legendFormat": "Miner" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": 0 }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "title": "Gateway Status", + "type": "stat", + "targets": [ + { + "expr": "up{job=\"reasonforge-gateway\"}", + "legendFormat": "Gateway" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "id": 4, + "title": "Request Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(http_requests_total[5m])", + "legendFormat": 
"{{job}} - {{status}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "id": 5, + "title": "Request Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "{{job}} p95" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "id": 6, + "title": "Validator Average Score", + "type": "timeseries", + "targets": [ + { + "expr": "validator_average_score", + "legendFormat": "Average Score" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "id": 7, + "title": "Memory Usage", + "type": "timeseries", + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, + "targets": [ + { + "expr": "process_resident_memory_bytes", + "legendFormat": "{{job}}" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 20 }, + "id": 8, + "title": "Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(http_requests_total{status=~\"5..\"}[5m])", + "legendFormat": "{{job}} 5xx errors" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["reasonforge", "subnet", "overview"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "ReasonForge - Subnet Overview", + "uid": "reasonforge-subnet-overview", + "version": 1 +} diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml new file mode 100644 index 0000000..eb0c7fa --- /dev/null +++ b/monitoring/prometheus.yml @@ -0,0 +1,30 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + scrape_timeout: 10s + +rule_files: + - "alerts/*.yml" + +scrape_configs: + - job_name: "prometheus" + 
static_configs: + - targets: ["localhost:9090"] + + - job_name: "reasonforge-validator" + static_configs: + - targets: ["validator:9092"] + metrics_path: /metrics + scrape_interval: 10s + + - job_name: "reasonforge-miner" + static_configs: + - targets: ["miner:9091"] + metrics_path: /metrics + scrape_interval: 10s + + - job_name: "reasonforge-gateway" + static_configs: + - targets: ["gateway:8000"] + metrics_path: /metrics + scrape_interval: 15s diff --git a/neurons/miner.py b/neurons/miner.py new file mode 100644 index 0000000..b6b49ee --- /dev/null +++ b/neurons/miner.py @@ -0,0 +1,258 @@ +""" +ReasonForge - Miner Neuron Entry Point + +Registers an Axon server, attaches handlers for each Synapse type, +and serves continuously responding to validator queries. +""" + +from __future__ import annotations + +import asyncio +import logging +import time +import traceback +from typing import Tuple + +# Conditional bittensor import +try: + import bittensor as bt + + HAS_BITTENSOR = True +except ImportError: + HAS_BITTENSOR = False + +from reasonforge.base.config import MinerConfig +from reasonforge.base.neuron import BaseNeuron +from reasonforge.miner.reasoning import ReasoningEngine +from reasonforge.protocol import HealthCheck, ReasoningTask, TaskResult + +logger = logging.getLogger("reasonforge.miner") + + +class ReasonForgeMiner(BaseNeuron): + """Production miner neuron for the ReasonForge subnet.""" + + neuron_type = "miner" + + def __init__(self, config=None): + super().__init__(config) + + # Parse miner-specific config + self.miner_config = MinerConfig( + backend=getattr(self.config, "miner.backend", "openai"), + model=getattr(self.config, "miner.model", "gpt-4o"), + api_key_env=getattr(self.config, "miner.api_key_env", "OPENAI_API_KEY"), + max_concurrent=getattr(self.config, "miner.max_concurrent", 4), + port=getattr(self.config, "miner.port", 8091), + domains=getattr( + self.config, + "miner.domains", + ["mathematics", "code", "scientific", "strategic", "causal", 
"ethical"], + ), + ) + + # Initialize reasoning engine + self.reasoning_engine = ReasoningEngine( + backend=self.miner_config.backend, + model=self.miner_config.model, + domains=self.miner_config.domains, + api_key=self.miner_config.api_key, + ) + + # Semaphore for concurrency control + self._semaphore = asyncio.Semaphore(self.miner_config.max_concurrent) + + # Task tracking + self._tasks_processed = 0 + self._last_task_time = 0.0 + + # Setup Axon if bittensor is available + self.axon = None + if HAS_BITTENSOR and self.wallet: + self.axon = bt.axon(wallet=self.wallet, port=self.miner_config.port) + self.axon.attach( + forward_fn=self.handle_reasoning_task, + blacklist_fn=self.blacklist_reasoning_task, + priority_fn=self.priority_reasoning_task, + ).attach( + forward_fn=self.handle_health_check, + ).attach( + forward_fn=self.handle_task_result, + ) + + async def handle_reasoning_task(self, synapse: ReasoningTask) -> ReasoningTask: + """Core handler: receive task, produce reasoning chain, return.""" + start_time = time.time_ns() + + async with self._semaphore: + try: + logger.info( + "Processing task %s (domain=%s, difficulty=%d)", + synapse.task_id, + synapse.domain, + synapse.difficulty, + ) + + # Execute reasoning + result = await self.reasoning_engine.solve( + problem=synapse.problem, + domain=synapse.domain, + difficulty=synapse.difficulty, + context=synapse.context, + constraints=synapse.constraints, + timeout=synapse.timeout_seconds, + ) + + # Fill mutable Synapse fields + synapse.reasoning_steps = [ + { + "step_id": i, + "reasoning": step.reasoning, + "evidence": step.evidence, + "confidence": step.confidence, + "formal_proof_fragment": step.formal_proof_fragment, + } + for i, step in enumerate(result.steps) + ] + synapse.final_answer = result.final_answer + synapse.proof_status = result.proof_status + synapse.proof_artifact = result.proof_artifact + synapse.code_artifact = result.code_artifact + synapse.time_taken_ms = int((time.time_ns() - start_time) / 
1_000_000) + synapse.submission_hash = synapse.compute_submission_hash() + + self._tasks_processed += 1 + self._last_task_time = time.time() + + logger.info( + "Task %s completed in %dms (%d steps)", + synapse.task_id, + synapse.time_taken_ms, + len(synapse.reasoning_steps), + ) + + except Exception as e: + logger.error("Task %s failed: %s", synapse.task_id, e) + traceback.print_exc() + synapse.final_answer = f"ERROR: {str(e)}" + synapse.reasoning_steps = [] + synapse.time_taken_ms = int((time.time_ns() - start_time) / 1_000_000) + + return synapse + + def blacklist_reasoning_task(self, synapse: ReasoningTask) -> Tuple[bool, str]: + """Reject requests from non-validators or unregistered neurons.""" + if not HAS_BITTENSOR or self.metagraph is None: + return False, "" + + try: + caller_hotkey = synapse.dendrite.hotkey + if caller_hotkey not in self.metagraph.hotkeys: + return True, "Unregistered hotkey" + caller_uid = self.metagraph.hotkeys.index(caller_hotkey) + if not self.metagraph.validator_permit[caller_uid]: + return True, "No validator permit" + except Exception as e: + logger.warning("Blacklist check failed: %s", e) + return False, "" + + return False, "" + + def priority_reasoning_task(self, synapse: ReasoningTask) -> float: + """Higher-stake validators get priority.""" + if not HAS_BITTENSOR or self.metagraph is None: + return 0.0 + + try: + caller_hotkey = synapse.dendrite.hotkey + caller_uid = self.metagraph.hotkeys.index(caller_hotkey) + return float(self.metagraph.S[caller_uid]) + except Exception: + return 0.0 + + async def handle_health_check(self, synapse: HealthCheck) -> HealthCheck: + """Respond to health checks from validators.""" + synapse.status = "ready" + synapse.supported_domains = self.miner_config.domains + synapse.model_info = f"{self.miner_config.backend}:{self.miner_config.model}" + synapse.version = "0.1.0" + return synapse + + async def handle_task_result(self, synapse: TaskResult) -> TaskResult: + """Receive score notifications from 
validators (informational).""" + if synapse.scores: + logger.info( + "Epoch %d results: S_epoch=%.4f, Rank=%s", + synapse.epoch_id, + synapse.s_epoch or 0.0, + synapse.rank or "?", + ) + return synapse + + def get_state_dict(self) -> dict: + return { + "tasks_processed": self._tasks_processed, + "last_task_time": self._last_task_time, + "step": self.step, + } + + def restore_state_dict(self, state: dict) -> None: + self._tasks_processed = state.get("tasks_processed", 0) + self._last_task_time = state.get("last_task_time", 0.0) + self.step = state.get("step", 0) + + def run(self) -> None: + """Main miner loop.""" + uid_str = self.uid if self.uid is not None else "offline" + logger.info("ReasonForge Miner starting (UID=%s)", uid_str) + + if self.axon and HAS_BITTENSOR: + self.axon.serve( + netuid=self.config.netuid, + subtensor=self.subtensor, + ) + self.axon.start() + logger.info("Axon server started on port %d", self.miner_config.port) + + self.is_running = True + + try: + while self.is_running: + try: + if self.should_sync_metagraph(): + self.sync() + self.uid = self._get_uid() + + self.step += 1 + + # Save state periodically + if self.step % 100 == 0: + self.save_state() + + time.sleep(12) # One block + + except KeyboardInterrupt: + break + except Exception as e: + logger.error("Miner loop error: %s", e) + traceback.print_exc() + time.sleep(12) + finally: + if self.axon: + self.axon.stop() + self.save_state() + logger.info("Miner stopped.") + + +def main(): + """Entry point for miner neuron.""" + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(name)s | %(levelname)s | %(message)s", + ) + miner = ReasonForgeMiner() + miner.run() + + +if __name__ == "__main__": + main() diff --git a/neurons/validator.py b/neurons/validator.py new file mode 100644 index 0000000..3994bd4 --- /dev/null +++ b/neurons/validator.py @@ -0,0 +1,429 @@ +""" +ReasonForge - Validator Neuron Entry Point + +Runs the main epoch loop: generate tasks, query miners, score 
responses, +compute and set on-chain weights. +""" + +from __future__ import annotations + +import asyncio +import logging +import time +import traceback +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, List, Tuple + +try: + import torch + + HAS_TORCH = True +except ImportError: + HAS_TORCH = False + +try: + import bittensor as bt + + HAS_BITTENSOR = True +except ImportError: + HAS_BITTENSOR = False + +from reasonforge.base.config import ValidatorConfig +from reasonforge.base.neuron import BaseNeuron +from reasonforge.engine import ScoringEngine +from reasonforge.protocol import ReasoningTask, verify_submission_hash +from reasonforge.types import ( + PEB_K, + SIMILARITY_PENALTY, + SIMILARITY_THRESHOLD, + DimensionScores, + MinerState, + Task, +) +from reasonforge.validator.scoring import ValidatorScorer +from reasonforge.validator.task_manager import TaskManager +from reasonforge.validator.trap_manager import TrapManager +from reasonforge.validator.weight_setter import WeightSetter + +logger = logging.getLogger("reasonforge.validator") + + +@dataclass +class TaskProcessingResult: + """Result of processing a single task across all miners.""" + + task: Task + scored_results: List[Tuple[int, DimensionScores]] = field(default_factory=list) + + +class ReasonForgeValidator(BaseNeuron): + """Production validator neuron for the ReasonForge subnet.""" + + neuron_type = "validator" + + def __init__(self, config=None): + super().__init__(config) + + # Parse validator-specific config + self.val_config = ValidatorConfig( + epoch_length=getattr(self.config, "validator.epoch_length", 360), + tasks_per_epoch=getattr(self.config, "validator.tasks_per_epoch", 12), + trap_rate=getattr(self.config, "validator.trap_rate", 0.15), + timeout=getattr(self.config, "validator.timeout", 300), + sample_size=getattr(self.config, "validator.sample_size", 16), + sandbox_enabled=getattr(self.config, "validator.sandbox_enabled", False), + 
lean4_enabled=getattr(self.config, "validator.lean4_enabled", False), + embedding_model=getattr(self.config, "validator.embedding_model", "all-MiniLM-L6-v2"), + ) + + # Initialize components + self.dendrite = None + if HAS_BITTENSOR and self.wallet: + self.dendrite = bt.dendrite(wallet=self.wallet) + + self.task_manager = TaskManager() + self.trap_manager = TrapManager(trap_rate=self.val_config.trap_rate) + self.scorer = ValidatorScorer( + lean4_enabled=self.val_config.lean4_enabled, + sandbox_enabled=self.val_config.sandbox_enabled, + ) + self.weight_setter = WeightSetter( + subtensor=self.subtensor, + wallet=self.wallet, + config=self.config, + ) + + # Similarity detector (lazy-loaded) + self.similarity_detector = None + + # State tracking + self.miner_states: Dict[int, MinerState] = {} + self.epoch_id: int = 0 + self.last_epoch_block: int = 0 + self.scores = None + if HAS_TORCH: + self.scores = torch.zeros(256) + + def _get_similarity_detector(self): + """Lazy-load similarity detector to avoid import cost at startup.""" + if self.similarity_detector is None: + try: + from reasonforge.embeddings.similarity import SimilarityDetector + + self.similarity_detector = SimilarityDetector( + model_name=self.val_config.embedding_model + ) + except ImportError: + logger.warning("Embedding similarity not available, using basic detection") + return self.similarity_detector + + def is_epoch_boundary(self, current_block: int) -> bool: + """Check if we've reached the next epoch boundary.""" + return (current_block - self.last_epoch_block) >= self.val_config.epoch_length + + def get_queryable_miners(self) -> List[int]: + """Get list of miner UIDs to query.""" + if not HAS_BITTENSOR or self.metagraph is None: + return list(range(min(10, 256))) # Test mode + + miner_uids = [] + for uid in range(self.metagraph.n): + if uid == self.uid: + continue # Skip self + # Check if axon is serving + axon = self.metagraph.axons[uid] + if axon.ip != "0.0.0.0" and axon.port > 0: + 
miner_uids.append(uid) + + # Sample if too many + if len(miner_uids) > self.val_config.sample_size: + import random + + miner_uids = random.sample(miner_uids, self.val_config.sample_size) + + return miner_uids + + def get_or_create_miner_state(self, uid: int) -> MinerState: + """Get or create miner state for a UID.""" + if uid not in self.miner_states: + self.miner_states[uid] = MinerState(miner_id=str(uid)) + return self.miner_states[uid] + + def run(self) -> None: + """Main validator loop.""" + uid_str = self.uid if self.uid is not None else "offline" + logger.info("ReasonForge Validator starting (UID=%s)", uid_str) + + self.is_running = True + + try: + while self.is_running: + try: + # 1. Sync metagraph + if self.should_sync_metagraph(): + self.sync() + + # 2. Check if epoch boundary + if HAS_BITTENSOR and self.subtensor: + current_block = self.subtensor.get_current_block() + if self.is_epoch_boundary(current_block): + self._run_epoch() + self.last_epoch_block = current_block + else: + # Offline mode: run epoch every 60 seconds + self._run_epoch() + time.sleep(60) + continue + + # 3. 
Sleep for one block + time.sleep(12) + + except KeyboardInterrupt: + break + except Exception as e: + logger.error("Validator loop error: %s", e) + traceback.print_exc() + time.sleep(12) + finally: + self.save_state() + logger.info("Validator stopped.") + + def _run_epoch(self) -> None: + """Execute one complete scoring epoch.""" + self.epoch_id += 1 + logger.info("=== EPOCH %d ===", self.epoch_id) + + # Phase A: Generate tasks + tasks = self.task_manager.generate_epoch_tasks( + count=self.val_config.tasks_per_epoch, + trap_rate=self.val_config.trap_rate, + ) + logger.info("Generated %d tasks (%d traps)", len(tasks), sum(1 for t in tasks if t.is_trap)) + + # Phase B: Process each task + all_task_results = [] + for task in tasks: + result = asyncio.get_event_loop().run_until_complete(self._process_task(task)) + all_task_results.append(result) + + # Phase C: Compute epoch scores + self._compute_epoch_scores(all_task_results) + + # Phase D: Set on-chain weights + self._set_weights() + + # Phase E: Persist state + self.save_state() + + # Phase F: Log results + self._log_epoch_results() + + async def _process_task(self, task: Task) -> TaskProcessingResult: + """Query miners, collect responses, score them.""" + miner_uids = self.get_queryable_miners() + + if not miner_uids: + logger.warning("No queryable miners found") + return TaskProcessingResult(task=task) + + # Build synapse + from reasonforge.protocol import create_reasoning_task + + synapse = create_reasoning_task( + task_id=task.task_id, + problem=task.problem, + domain=task.domain.value if hasattr(task.domain, "value") else task.domain, + difficulty=task.difficulty, + timeout_seconds=self.val_config.timeout, + ) + + # Query miners + responses = [] + if self.dendrite and HAS_BITTENSOR and self.metagraph: + axons = [self.metagraph.axons[uid] for uid in miner_uids] + responses = await self.dendrite( + axons=axons, + synapse=synapse, + timeout=self.val_config.timeout, + ) + else: + # Test mode: empty responses + 
responses = [ReasoningTask(**synapse.model_dump()) for _ in miner_uids] + + # Score each response + scored_results = [] + for uid, response in zip(miner_uids, responses): + try: + # Check for timeout/failure + if response.final_answer is None: + scored_results.append((uid, DimensionScores(0, 0, 0, 0))) + continue + + # Verify submission hash + if response.submission_hash and not verify_submission_hash(response): + logger.warning("UID %d: hash mismatch, penalizing", uid) + scored_results.append((uid, DimensionScores(0, 0, 0, 0))) + continue + + # Compute dimension scores + response_data = response.deserialize() + dim_scores = await self.scorer.compute_dimensions(task, response_data) + + # Check plagiarism + sim_detector = self._get_similarity_detector() + if sim_detector: + try: + similarity = sim_detector.check_against_batch( + response, [r for r in responses if r != response] + ) + if similarity > SIMILARITY_THRESHOLD: + dim_scores = DimensionScores( + quality=dim_scores.quality * SIMILARITY_PENALTY, + accuracy=dim_scores.accuracy * SIMILARITY_PENALTY, + novelty=dim_scores.novelty * SIMILARITY_PENALTY, + efficiency=dim_scores.efficiency, + ) + except Exception as e: + logger.debug("Similarity check failed: %s", e) + + scored_results.append((uid, dim_scores)) + + # Track trap scores + if task.is_trap: + trap_score = self.trap_manager.evaluate_trap_response( + task, response.final_answer, response.reasoning_steps + ) + self.trap_manager.record_trap_score(uid, trap_score) + + except Exception as e: + logger.warning("Scoring UID %d failed: %s", uid, e) + scored_results.append((uid, DimensionScores(0, 0, 0, 0))) + + return TaskProcessingResult(task=task, scored_results=scored_results) + + def _compute_epoch_scores(self, task_results: List[TaskProcessingResult]) -> None: + """Aggregate per-task CMS into S_epoch using MVP engine.""" + # Collect per-miner scores + miner_task_scores: Dict[int, List[Tuple[float, float]]] = defaultdict(list) + + for tr in task_results: + for 
uid, dim_scores in tr.scored_results: + cms = ScoringEngine.compute_cms(dim_scores) + diff_mult = tr.task.difficulty_multiplier + miner_task_scores[uid].append((cms, diff_mult)) + + # Compute S_epoch for each miner + for uid, scores in miner_task_scores.items(): + ms = self.get_or_create_miner_state(uid) + cms_list = [s[0] for s in scores] + diff_mults = [s[1] for s in scores] + + trap_penalty = self.trap_manager.get_trap_penalty(uid) + + ms.s_epoch = ScoringEngine.compute_s_epoch(cms_list, diff_mults, trap_penalty) + ms.epoch_scores.append(ms.s_epoch) + ms.task_count += len(cms_list) + + # Rank miners + ranked = sorted( + [(uid, ms) for uid, ms in self.miner_states.items() if ms.s_epoch > 0], + key=lambda x: x[1].s_epoch, + reverse=True, + ) + + for i, (uid, ms) in enumerate(ranked): + ms.rank = i + 1 + if ms.rank <= PEB_K: + ms.streak += 1 + else: + ms.streak = 0 + ms.peb = ScoringEngine.compute_peb(ms.rank, ms.streak) + + def _set_weights(self) -> None: + """Compute and set on-chain weights.""" + n = 256 + if HAS_BITTENSOR and self.metagraph: + n = self.metagraph.n + + miner_data = {} + for uid, ms in self.miner_states.items(): + miner_data[uid] = {"s_epoch": ms.s_epoch, "peb": ms.peb} + + uids, weights = self.weight_setter.compute_weights(miner_data, n) + + if HAS_BITTENSOR and self.subtensor: + netuid = getattr(self.config, "netuid", 1) + success = self.weight_setter.submit(uids, weights, netuid) + if success: + logger.info("Weights set for epoch %d", self.epoch_id) + else: + logger.error("Failed to set weights for epoch %d", self.epoch_id) + else: + logger.info( + "Weights computed (offline mode): %d non-zero entries", + len(uids) if hasattr(uids, "__len__") else 0, + ) + + def _log_epoch_results(self) -> None: + """Log epoch results summary.""" + active = [(uid, ms) for uid, ms in self.miner_states.items() if ms.s_epoch > 0] + if not active: + logger.info("Epoch %d: No active miners", self.epoch_id) + return + + active.sort(key=lambda x: x[1].s_epoch, 
reverse=True) + logger.info("Epoch %d results (%d active miners):", self.epoch_id, len(active)) + for uid, ms in active[:10]: + logger.info( + " UID %d: S_epoch=%.4f, PEB=%.4f, Rank=%d, Streak=%d", + uid, + ms.s_epoch, + ms.peb, + ms.rank, + ms.streak, + ) + + def get_state_dict(self) -> dict: + return { + "epoch_id": self.epoch_id, + "last_epoch_block": self.last_epoch_block, + "miner_states": { + uid: { + "s_epoch": ms.s_epoch, + "peb": ms.peb, + "rank": ms.rank, + "streak": ms.streak, + "total_tao_earned": ms.total_tao_earned, + "task_count": ms.task_count, + } + for uid, ms in self.miner_states.items() + }, + } + + def restore_state_dict(self, state: dict) -> None: + self.epoch_id = state.get("epoch_id", 0) + self.last_epoch_block = state.get("last_epoch_block", 0) + for uid_str, ms_data in state.get("miner_states", {}).items(): + uid = int(uid_str) + ms = self.get_or_create_miner_state(uid) + ms.s_epoch = ms_data.get("s_epoch", 0.0) + ms.peb = ms_data.get("peb", 0.0) + ms.rank = ms_data.get("rank", 0) + ms.streak = ms_data.get("streak", 0) + ms.total_tao_earned = ms_data.get("total_tao_earned", 0.0) + ms.task_count = ms_data.get("task_count", 0) + + +def main(): + """Entry point for validator neuron.""" + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(name)s | %(levelname)s | %(message)s", + ) + validator = ReasonForgeValidator() + validator.run() + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index d5c9f0f..888ea06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,14 +5,69 @@ build-backend = "hatchling.build" [project] name = "reasonforge" version = "0.1.0" -description = "The Decentralized Marketplace for Verifiable Intelligence" +description = "The Decentralized Marketplace for Verifiable Intelligence - Bittensor Subnet" requires-python = ">=3.9" -dependencies = [] +dependencies = [ + "pydantic>=2.0.0", + "numpy>=1.24.0", +] [project.optional-dependencies] api = ["fastapi>=0.100.0", 
"uvicorn>=0.23.0"] -dev = ["pytest>=7.0.0"] -all = ["fastapi>=0.100.0", "uvicorn>=0.23.0", "pytest>=7.0.0"] +miner = [ + "openai>=1.0.0", + "anthropic>=0.20.0", +] +validator = [ + "sympy>=1.12", + "prometheus-client>=0.19.0", + "structlog>=23.0.0", +] +embeddings = ["sentence-transformers>=2.2.0"] +gateway = [ + "fastapi>=0.100.0", + "uvicorn>=0.23.0", + "python-jose>=3.3.0", + "passlib>=1.7.4", +] +dev = [ + "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", + "pytest-timeout>=2.1.0", + "ruff>=0.1.0", + "mypy>=1.6.0", + "httpx>=0.24.0", +] +all = [ + "fastapi>=0.100.0", + "uvicorn>=0.23.0", + "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", + "openai>=1.0.0", + "anthropic>=0.20.0", + "sympy>=1.12", + "prometheus-client>=0.19.0", + "structlog>=23.0.0", + "sentence-transformers>=2.2.0", + "python-jose>=3.3.0", + "httpx>=0.24.0", +] [project.scripts] reasonforge = "reasonforge.run:main" + +[tool.pytest.ini_options] +testpaths = ["tests"] +asyncio_mode = "auto" + +[tool.ruff] +target-version = "py310" +line-length = 100 + +[tool.ruff.lint] +select = ["E", "F", "I", "W"] +ignore = ["E501"] + +[tool.mypy] +python_version = "3.10" +ignore_missing_imports = true diff --git a/reasonforge/__init__.py b/reasonforge/__init__.py index 5490b4a..8cfac4c 100644 --- a/reasonforge/__init__.py +++ b/reasonforge/__init__.py @@ -7,29 +7,28 @@ __version__ = "0.1.0" -from .types import ( - Domain, - TaskSource, - Task, - ReasoningStep, - MinerSubmission, - DimensionScores, - ValidatorScore, - MinerState, - ValidatorState, - EpochResult, -) - from .engine import ScoringEngine +from .plagiarism import PlagiarismDetector from .simulator import ( + EpochSimulator, MinerProfile, ValidatorProfile, - EpochSimulator, create_default_miners, create_default_validators, ) from .task_generator import TaskGenerator -from .plagiarism import PlagiarismDetector +from .types import ( + DimensionScores, + Domain, + EpochResult, + MinerState, + MinerSubmission, + ReasoningStep, + Task, + TaskSource, + 
ValidatorScore, + ValidatorState, +) __all__ = [ "Domain", diff --git a/reasonforge/__main__.py b/reasonforge/__main__.py index a6f5b2d..7c9b71c 100644 --- a/reasonforge/__main__.py +++ b/reasonforge/__main__.py @@ -1,4 +1,5 @@ """Allow running as: python -m reasonforge""" + from .run import main main() diff --git a/reasonforge/base/__init__.py b/reasonforge/base/__init__.py new file mode 100644 index 0000000..15d30da --- /dev/null +++ b/reasonforge/base/__init__.py @@ -0,0 +1,12 @@ +""" +ReasonForge - Base Neuron Infrastructure + +Provides shared infrastructure for miners and validators: +- BaseNeuron: wallet, subtensor, metagraph management +- MinerConfig / ValidatorConfig: CLI argument parsing +""" + +from .config import MinerConfig, ValidatorConfig +from .neuron import BaseNeuron + +__all__ = ["BaseNeuron", "MinerConfig", "ValidatorConfig"] diff --git a/reasonforge/base/config.py b/reasonforge/base/config.py new file mode 100644 index 0000000..43ae1d1 --- /dev/null +++ b/reasonforge/base/config.py @@ -0,0 +1,167 @@ +""" +ReasonForge - Configuration Management + +CLI argument parsing for miner and validator neurons. +Gracefully degrades when bittensor is not installed. 
+""" + +from __future__ import annotations + +import argparse +import os +from dataclasses import dataclass, field +from typing import List, Optional + +try: + import bittensor as bt + + HAS_BITTENSOR = True +except ImportError: + HAS_BITTENSOR = False + + +@dataclass +class MinerConfig: + """Miner-specific configuration.""" + + backend: str = "openai" + model: str = "gpt-4o" + api_key_env: str = "OPENAI_API_KEY" + max_concurrent: int = 4 + port: int = 8091 + domains: List[str] = field( + default_factory=lambda: [ + "mathematics", + "code", + "scientific", + "strategic", + "causal", + "ethical", + ] + ) + + @staticmethod + def add_args(parser: argparse.ArgumentParser) -> None: + group = parser.add_argument_group("Miner") + group.add_argument( + "--miner.backend", + type=str, + default="openai", + choices=["openai", "anthropic", "local", "agent"], + help="LLM backend to use", + ) + group.add_argument("--miner.model", type=str, default="gpt-4o") + group.add_argument("--miner.api_key_env", type=str, default="OPENAI_API_KEY") + group.add_argument("--miner.max_concurrent", type=int, default=4) + group.add_argument("--miner.port", type=int, default=8091) + group.add_argument( + "--miner.domains", + type=str, + nargs="+", + default=["mathematics", "code", "scientific", "strategic", "causal", "ethical"], + ) + + @classmethod + def from_args(cls, args: argparse.Namespace) -> MinerConfig: + return cls( + backend=getattr(args, "miner.backend", "openai"), + model=getattr(args, "miner.model", "gpt-4o"), + api_key_env=getattr(args, "miner.api_key_env", "OPENAI_API_KEY"), + max_concurrent=getattr(args, "miner.max_concurrent", 4), + port=getattr(args, "miner.port", 8091), + domains=getattr(args, "miner.domains", cls.domains), + ) + + @property + def api_key(self) -> Optional[str]: + return os.environ.get(self.api_key_env) + + +@dataclass +class ValidatorConfig: + """Validator-specific configuration.""" + + epoch_length: int = 360 + tasks_per_epoch: int = 12 + trap_rate: float = 
0.15 + timeout: int = 300 + sample_size: int = 16 + port: int = 8092 + sandbox_enabled: bool = False + lean4_enabled: bool = False + embedding_model: str = "all-MiniLM-L6-v2" + + @staticmethod + def add_args(parser: argparse.ArgumentParser) -> None: + group = parser.add_argument_group("Validator") + group.add_argument( + "--validator.epoch_length", + type=int, + default=360, + help="Blocks per epoch (360 = ~72 min)", + ) + group.add_argument("--validator.tasks_per_epoch", type=int, default=12) + group.add_argument("--validator.trap_rate", type=float, default=0.15) + group.add_argument("--validator.timeout", type=int, default=300) + group.add_argument( + "--validator.sample_size", + type=int, + default=16, + help="Number of miners to query per task", + ) + group.add_argument("--validator.port", type=int, default=8092) + group.add_argument("--validator.sandbox_enabled", action="store_true") + group.add_argument("--validator.lean4_enabled", action="store_true") + group.add_argument( + "--validator.embedding_model", + type=str, + default="all-MiniLM-L6-v2", + ) + + @classmethod + def from_args(cls, args: argparse.Namespace) -> ValidatorConfig: + return cls( + epoch_length=getattr(args, "validator.epoch_length", 360), + tasks_per_epoch=getattr(args, "validator.tasks_per_epoch", 12), + trap_rate=getattr(args, "validator.trap_rate", 0.15), + timeout=getattr(args, "validator.timeout", 300), + sample_size=getattr(args, "validator.sample_size", 16), + port=getattr(args, "validator.port", 8092), + sandbox_enabled=getattr(args, "validator.sandbox_enabled", False), + lean4_enabled=getattr(args, "validator.lean4_enabled", False), + embedding_model=getattr(args, "validator.embedding_model", "all-MiniLM-L6-v2"), + ) + + +def create_parser(neuron_type: str = "miner") -> argparse.ArgumentParser: + """Create CLI argument parser with common + neuron-specific args.""" + parser = argparse.ArgumentParser( + description=f"ReasonForge {neuron_type.title()} Neuron", + ) + + # Common args + 
parser.add_argument("--netuid", type=int, required=True, help="Subnet UID") + parser.add_argument( + "--subtensor.network", + type=str, + default="finney", + help="Bittensor network (finney|test|local)", + ) + parser.add_argument("--subtensor.chain_endpoint", type=str, default=None) + parser.add_argument("--logging.debug", action="store_true") + parser.add_argument("--wallet.name", type=str, default="default") + parser.add_argument("--wallet.hotkey", type=str, default="default") + + # Add bittensor native args if available + if HAS_BITTENSOR: + bt.Wallet.add_args(parser) + bt.Subtensor.add_args(parser) + bt.logging.add_args(parser) + + # Add neuron-specific args + if neuron_type == "miner": + MinerConfig.add_args(parser) + elif neuron_type == "validator": + ValidatorConfig.add_args(parser) + + return parser diff --git a/reasonforge/base/neuron.py b/reasonforge/base/neuron.py new file mode 100644 index 0000000..5fc88e4 --- /dev/null +++ b/reasonforge/base/neuron.py @@ -0,0 +1,159 @@ +""" +ReasonForge - Base Neuron Class + +Shared infrastructure for miners and validators. +Handles wallet, subtensor, metagraph, registration, state persistence. +""" + +from __future__ import annotations + +import logging +import os +from abc import ABC, abstractmethod +from typing import Optional + +try: + import bittensor as bt + + HAS_BITTENSOR = True +except ImportError: + HAS_BITTENSOR = False + +from .config import create_parser + +logger = logging.getLogger("reasonforge") + + +class BaseNeuron(ABC): + """Shared infrastructure for miners and validators.""" + + neuron_type: str = "base" + + def __init__(self, config=None): + # 1. Parse CLI args / config + self.config = config or self.get_config() + + # 2. 
Initialize Bittensor objects + if HAS_BITTENSOR: + self.wallet = bt.Wallet(config=self.config) + self.subtensor = bt.Subtensor(config=self.config) + self.metagraph = self.subtensor.metagraph(netuid=self.config.netuid) + else: + self.wallet = None + self.subtensor = None + self.metagraph = None + logger.warning("Bittensor not installed. Running in offline/test mode.") + + # 3. Check registration + self.uid = self._get_uid() + if self.uid is None and HAS_BITTENSOR: + logger.error( + "Neuron not registered on subnet %s. Run: btcli register --netuid %s", + getattr(self.config, "netuid", "?"), + getattr(self.config, "netuid", "?"), + ) + + # 4. State tracking + self.last_sync_block: int = 0 + self.is_running: bool = False + self.step: int = 0 + + # 5. Initialize state persistence (deferred import to avoid circular) + self.state_db = None + self._init_state_db() + + # 6. Load previous state if exists + self.load_state() + + def _init_state_db(self) -> None: + """Initialize SQLite state database.""" + try: + from ..state.database import StateDatabase + + db_dir = os.path.join("state") + os.makedirs(db_dir, exist_ok=True) + uid_str = self.uid if self.uid is not None else "unregistered" + db_path = os.path.join(db_dir, f"{self.neuron_type}_{uid_str}.db") + self.state_db = StateDatabase(db_path) + except ImportError: + logger.debug("State module not available, running without persistence.") + self.state_db = None + + def get_config(self): + """Parse CLI arguments.""" + if HAS_BITTENSOR: + parser = create_parser(self.neuron_type) + config = bt.Config(parser) + return config + else: + parser = create_parser(self.neuron_type) + args = parser.parse_args() + return args + + def _get_uid(self) -> Optional[int]: + """Find our UID in the metagraph.""" + if not HAS_BITTENSOR or self.metagraph is None or self.wallet is None: + return None + try: + hotkey = self.wallet.hotkey.ss58_address + if hotkey in self.metagraph.hotkeys: + return self.metagraph.hotkeys.index(hotkey) + except 
Exception: + pass + return None + + def sync(self) -> None: + """Re-sync metagraph from chain.""" + if HAS_BITTENSOR and self.subtensor and self.metagraph: + self.metagraph.sync(subtensor=self.subtensor) + self.last_sync_block = self.subtensor.get_current_block() + logger.debug("Metagraph synced at block %d", self.last_sync_block) + + def should_sync_metagraph(self) -> bool: + """Sync every 5 blocks (~60 seconds).""" + if not HAS_BITTENSOR or not self.subtensor: + return False + try: + current_block = self.subtensor.get_current_block() + return (current_block - self.last_sync_block) >= 5 + except Exception: + return False + + def save_state(self) -> None: + """Persist neuron state to SQLite.""" + if self.state_db is not None: + try: + state_dict = self.get_state_dict() + self.state_db.save_checkpoint(state_dict) + except Exception as e: + logger.warning("Failed to save state: %s", e) + + def load_state(self) -> None: + """Restore from last checkpoint.""" + if self.state_db is not None: + try: + state = self.state_db.load_latest_checkpoint() + if state: + self.restore_state_dict(state) + logger.info("State restored from checkpoint.") + except Exception as e: + logger.debug("No state to restore: %s", e) + + @abstractmethod + def get_state_dict(self) -> dict: + """Serialize neuron state for persistence.""" + ... + + @abstractmethod + def restore_state_dict(self, state: dict) -> None: + """Restore neuron state from checkpoint.""" + ... + + @abstractmethod + def run(self) -> None: + """Main neuron loop.""" + ... + + def stop(self) -> None: + """Signal the neuron to stop.""" + self.is_running = False diff --git a/reasonforge/embeddings/__init__.py b/reasonforge/embeddings/__init__.py new file mode 100644 index 0000000..cf15ce3 --- /dev/null +++ b/reasonforge/embeddings/__init__.py @@ -0,0 +1,9 @@ +""" +ReasonForge - Embedding-Based Similarity Detection + +Replaces MVP Jaccard similarity with sentence-transformer cosine similarity. 
+""" + +from .similarity import SimilarityDetector + +__all__ = ["SimilarityDetector"] diff --git a/reasonforge/embeddings/similarity.py b/reasonforge/embeddings/similarity.py new file mode 100644 index 0000000..16c54cb --- /dev/null +++ b/reasonforge/embeddings/similarity.py @@ -0,0 +1,129 @@ +""" +ReasonForge - Embedding Similarity Detector + +Detect plagiarism between miner submissions using sentence embeddings. +Uses sentence-transformers/all-MiniLM-L6-v2 (fast, 384-dim). +""" + +from __future__ import annotations + +import logging +from typing import List + +import numpy as np + +logger = logging.getLogger("reasonforge.embeddings") + + +class SimilarityDetector: + """ + Detect plagiarism between miner submissions using sentence embeddings. + Uses cosine similarity on normalized embedding vectors. + """ + + def __init__(self, model_name: str = "all-MiniLM-L6-v2"): + self.model_name = model_name + self._model = None + self.history_embeddings: List[np.ndarray] = [] + self.max_history = 5000 + + def _get_model(self): + """Lazy-load the sentence-transformer model.""" + if self._model is None: + try: + from sentence_transformers import SentenceTransformer + + self._model = SentenceTransformer(self.model_name) + logger.info("Loaded embedding model: %s", self.model_name) + except ImportError: + raise ImportError( + "sentence-transformers not installed. 
" + "Install with: pip install sentence-transformers>=2.2.0" + ) + return self._model + + def embed_text(self, text: str) -> np.ndarray: + """Encode text into a normalized embedding vector.""" + model = self._get_model() + return model.encode(text, normalize_embeddings=True) + + def embed_submission(self, response) -> np.ndarray: + """Encode a ReasoningTask response into a single embedding vector.""" + steps = getattr(response, "reasoning_steps", None) or [] + if isinstance(steps, list): + steps_text = " ".join( + s.get("reasoning", "") if isinstance(s, dict) else str(s) for s in steps + ) + else: + steps_text = str(steps) + + final_answer = getattr(response, "final_answer", "") or "" + full_text = f"{steps_text} {final_answer}".strip() + + if not full_text: + return np.zeros(384) # Default embedding dimension + + return self.embed_text(full_text) + + def cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float: + """Compute cosine similarity between two vectors.""" + # Vectors are already normalized, so dot product = cosine similarity + return float(np.dot(a, b)) + + def check_against_batch(self, response, other_responses: list) -> float: + """ + Return max cosine similarity against other responses in this batch. + + Args: + response: The ReasoningTask response to check. + other_responses: List of other responses in the same batch. + + Returns: + Max similarity score (0.0 to 1.0). 
+ """ + if not other_responses: + return 0.0 + + try: + target_emb = self.embed_submission(response) + other_embs = np.array([self.embed_submission(r) for r in other_responses]) + + # Cosine similarities (embeddings are normalized) + similarities = other_embs @ target_emb + return float(np.max(similarities)) + except Exception as e: + logger.warning("Batch similarity check failed: %s", e) + return 0.0 + + def check_against_history(self, response) -> float: + """Check against historical submissions (cross-epoch plagiarism).""" + if not self.history_embeddings: + return 0.0 + + try: + target_emb = self.embed_submission(response) + history_matrix = np.array(self.history_embeddings[-self.max_history :]) + similarities = history_matrix @ target_emb + return float(np.max(similarities)) + except Exception as e: + logger.warning("History similarity check failed: %s", e) + return 0.0 + + def add_to_history(self, response) -> None: + """Store embedding for future cross-epoch checks.""" + try: + emb = self.embed_submission(response) + self.history_embeddings.append(emb) + if len(self.history_embeddings) > self.max_history: + self.history_embeddings = self.history_embeddings[-self.max_history :] + except Exception as e: + logger.debug("Failed to add to history: %s", e) + + def check_text_similarity(self, text_a: str, text_b: str) -> float: + """Direct text-to-text similarity check.""" + try: + emb_a = self.embed_text(text_a) + emb_b = self.embed_text(text_b) + return self.cosine_similarity(emb_a, emb_b) + except Exception: + return 0.0 diff --git a/reasonforge/engine.py b/reasonforge/engine.py index c69c907..d31c051 100644 --- a/reasonforge/engine.py +++ b/reasonforge/engine.py @@ -82,9 +82,7 @@ def compute_peb(rank: int, streak: int) -> float: return PEB_ALPHA * (1.0 / rank) * math.sqrt(capped_streak) @staticmethod - def distribute_miner_emissions( - miners: List[MinerState], pool: float - ) -> List[float]: + def distribute_miner_emissions(miners: List[MinerState], pool: 
float) -> List[float]: """ Eq. 5 — Final Miner Reward R(m) = E_miner * [S_epoch(m) * (1 + PEB(m))] / sum_j[S_epoch(j) * (1 + PEB(j))] @@ -109,9 +107,7 @@ def apply_breakthrough(cms: float, is_breakthrough: bool) -> float: return cms @staticmethod - def compute_vas( - v_scores: List[float], consensus_scores: List[float] - ) -> float: + def compute_vas(v_scores: List[float], consensus_scores: List[float]) -> float: """ Eq. 7 — Validator Accuracy Score (VAS) VAS(v) = 1 - (1/|T_v|) * sum|score_v(m,t) - score_consensus(m,t)| @@ -131,9 +127,7 @@ def distribute_validator_emissions( R_v(v) = E_validator * [VAS(v) * stake(v) * rep_mult(v)] / sum_k[VAS(k) * stake(k) * rep_mult(k)] """ - weighted = [ - v.current_vas * v.stake * v.reputation_multiplier for v in validators - ] + weighted = [v.current_vas * v.stake * v.reputation_multiplier for v in validators] total_weight = sum(weighted) if total_weight <= 0: n = len(validators) @@ -165,9 +159,7 @@ def compute_slash(stake: float, vas_7d_avg: float) -> float: return VAS_SLASH_GAMMA * stake * (VAS_SLASH_THRESHOLD - vas_7d_avg) ** 2 @staticmethod - def compute_objective_score( - checks: Dict[str, float], weights: Dict[str, float] - ) -> float: + def compute_objective_score(checks: Dict[str, float], weights: Dict[str, float]) -> float: """ Eq. 11 — Objective Score O_score(m,t) = sum_k(omega_k * check_k) diff --git a/reasonforge/gateway/__init__.py b/reasonforge/gateway/__init__.py new file mode 100644 index 0000000..d467363 --- /dev/null +++ b/reasonforge/gateway/__init__.py @@ -0,0 +1,6 @@ +""" +ReasonForge - API Gateway + +External-facing FastAPI application for task submission, results querying, +and network statistics. +""" diff --git a/reasonforge/gateway/app.py b/reasonforge/gateway/app.py new file mode 100644 index 0000000..68c9902 --- /dev/null +++ b/reasonforge/gateway/app.py @@ -0,0 +1,229 @@ +""" +ReasonForge - API Gateway Application + +External-facing FastAPI application for users to submit tasks and query results. 
+""" + +from __future__ import annotations + +import time +import uuid +from typing import Optional + +from fastapi import Depends, FastAPI, Header, HTTPException, Request +from fastapi.middleware.cors import CORSMiddleware + +from .auth import APIKeyManager +from .billing import BillingTracker +from .rate_limiter import PerIPRateLimiter +from .schemas import ( + HealthResponse, + LeaderboardEntry, + LeaderboardResponse, + NetworkStatsResponse, + TaskResultResponse, + TaskSubmissionRequest, +) + +# ── App Setup ── + +app = FastAPI( + title="ReasonForge Gateway", + version="0.1.0", + description="External API for the ReasonForge Bittensor Subnet", +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# ── State (initialized on startup) ── + +_start_time = time.time() +_auth_manager = APIKeyManager() +_billing = BillingTracker() +_rate_limiter = PerIPRateLimiter(requests_per_minute=60) + +# Task queue (in production, this would be shared with the validator) +_task_queue: dict = {} +_task_results: dict = {} +_epoch_data: dict = {"epoch_id": 0, "miner_states": {}} + + +# ── Auth Dependency ── + + +async def verify_api_key(x_api_key: str = Header(None)) -> dict: + """Verify API key from request header.""" + if not x_api_key: + raise HTTPException(status_code=401, detail="Missing X-API-Key header") + info = _auth_manager.verify_key(x_api_key) + if not info: + raise HTTPException(status_code=403, detail="Invalid or expired API key") + return {"key": x_api_key, "info": info} + + +# ── Rate Limit Dependency ── + + +async def check_rate_limit(request: Request) -> None: + """Check per-IP rate limits.""" + client_ip = request.client.host if request.client else "unknown" + if not _rate_limiter.allow(client_ip): + raise HTTPException( + status_code=429, + detail="Rate limit exceeded. 
Try again later.", + ) + + +# ── Endpoints ── + + +@app.get("/v1/health", response_model=HealthResponse) +async def health_check(): + """Health check endpoint.""" + return HealthResponse( + status="healthy", + version="0.1.0", + uptime_seconds=time.time() - _start_time, + epoch=_epoch_data.get("epoch_id", 0), + db_connected=True, + ) + + +@app.post( + "/v1/tasks", + response_model=TaskResultResponse, + dependencies=[Depends(check_rate_limit)], +) +async def submit_task( + request: TaskSubmissionRequest, + auth: dict = Depends(verify_api_key), +): + """Submit a reasoning task to the network.""" + task_id = str(uuid.uuid4()) + + _task_queue[task_id] = { + "task_id": task_id, + "problem": request.problem, + "domain": request.domain, + "difficulty": request.difficulty, + "timeout_seconds": request.timeout_seconds, + "callback_url": request.callback_url, + "status": "queued", + "submitted_at": time.time(), + } + + # Track billing + _billing.record_usage( + key_id=auth["info"].key_id, + task_id=task_id, + domain=request.domain or "auto", + ) + _auth_manager.track_usage(auth["key"]) + + return TaskResultResponse(task_id=task_id, status="queued") + + +@app.get( + "/v1/tasks/{task_id}", + response_model=TaskResultResponse, + dependencies=[Depends(check_rate_limit)], +) +async def get_task_result(task_id: str): + """Poll for task results.""" + # Check task queue + if task_id in _task_queue: + task = _task_queue[task_id] + return TaskResultResponse( + task_id=task_id, + status=task["status"], + ) + + # Check completed results + if task_id in _task_results: + result = _task_results[task_id] + return TaskResultResponse( + task_id=task_id, + status="completed", + result=result.get("result"), + best_answer=result.get("best_answer"), + confidence=result.get("confidence"), + reasoning_steps=result.get("reasoning_steps"), + processing_time_ms=result.get("processing_time_ms"), + ) + + raise HTTPException(status_code=404, detail="Task not found") + + +@app.get( + "/v1/leaderboard", + 
response_model=LeaderboardResponse, + dependencies=[Depends(check_rate_limit)], +) +async def get_leaderboard(domain: Optional[str] = None, limit: int = 20): + """Get current miner rankings.""" + miners = _epoch_data.get("miner_states", {}) + entries = [] + + for uid_str, ms in miners.items(): + entries.append( + LeaderboardEntry( + uid=int(uid_str), + s_epoch=ms.get("s_epoch", 0.0), + peb=ms.get("peb", 0.0), + rank=ms.get("rank", 0), + streak=ms.get("streak", 0), + tasks_completed=ms.get("task_count", 0), + ) + ) + + entries.sort(key=lambda e: e.s_epoch, reverse=True) + entries = entries[:limit] + + return LeaderboardResponse( + epoch_id=_epoch_data.get("epoch_id", 0), + entries=entries, + total_miners=len(miners), + ) + + +@app.get( + "/v1/stats", + response_model=NetworkStatsResponse, + dependencies=[Depends(check_rate_limit)], +) +async def get_stats(): + """Get network statistics.""" + miners = _epoch_data.get("miner_states", {}) + avg_cms = 0.0 + if miners: + scores = [ms.get("s_epoch", 0) for ms in miners.values()] + avg_cms = sum(scores) / len(scores) if scores else 0.0 + + return NetworkStatsResponse( + current_epoch=_epoch_data.get("epoch_id", 0), + total_tasks_processed=len(_task_results), + active_miners=len(miners), + active_validators=1, + avg_cms=avg_cms, + total_emission_tao=0.0, + top_domains={}, + ) + + +def create_app(db=None, auth_manager=None) -> FastAPI: + """Factory for creating the gateway app with dependencies.""" + global _auth_manager, _billing + + if auth_manager: + _auth_manager = auth_manager + if db: + _auth_manager = APIKeyManager(db=db) + _billing = BillingTracker(db=db) + + return app diff --git a/reasonforge/gateway/auth.py b/reasonforge/gateway/auth.py new file mode 100644 index 0000000..8f3580c --- /dev/null +++ b/reasonforge/gateway/auth.py @@ -0,0 +1,91 @@ +""" +ReasonForge - API Authentication + +API key management with usage tracking and rate limits. 
+""" + +from __future__ import annotations + +import logging +import secrets +import uuid +from typing import Optional + +from .schemas import APIKeyInfo + +logger = logging.getLogger("reasonforge.gateway.auth") + +# Tier limits: requests per month +TIER_LIMITS = { + "free": 100, + "pro": 10_000, + "enterprise": 1_000_000, +} + + +class APIKeyManager: + """API key management with usage tracking and rate limits.""" + + def __init__(self, db=None): + self.db = db + + def create_key(self, owner: str, tier: str = "free") -> str: + """Generate a new API key.""" + key_id = str(uuid.uuid4()) + api_key = f"rf_{secrets.token_urlsafe(32)}" + limit = TIER_LIMITS.get(tier, 100) + + if self.db: + self.db.save_api_key( + key_id=key_id, + api_key=api_key, + owner=owner, + tier=tier, + limit=limit, + ) + + logger.info("Created API key for %s (tier=%s)", owner, tier) + return api_key + + def verify_key(self, key: str) -> Optional[APIKeyInfo]: + """Validate key and check rate limits.""" + if not self.db: + # If no DB, accept any key starting with rf_ + if key.startswith("rf_"): + return APIKeyInfo( + key_id="unknown", + owner="unknown", + tier="free", + requests_used=0, + requests_limit=100, + ) + return None + + key_data = self.db.get_api_key(key) + if not key_data: + return None + + info = APIKeyInfo( + key_id=key_data["key_id"], + owner=key_data["owner"], + tier=key_data["tier"], + requests_used=key_data["requests_used"], + requests_limit=key_data["requests_limit"], + ) + + # Check quota + if info.requests_used >= info.requests_limit: + logger.warning( + "API key %s exceeded quota (%d/%d)", + info.key_id, + info.requests_used, + info.requests_limit, + ) + return None + + return info + + def track_usage(self, key: str) -> None: + """Record API usage.""" + if self.db: + self.db.increment_api_usage(key) diff --git a/reasonforge/gateway/billing.py b/reasonforge/gateway/billing.py new file mode 100644 index 0000000..3091846 --- /dev/null +++ b/reasonforge/gateway/billing.py @@ -0,0 
+1,69 @@ +""" +ReasonForge - Billing Module + +Usage tracking and quotas for API consumers. +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from datetime import datetime +from typing import Dict, List + +logger = logging.getLogger("reasonforge.gateway.billing") + + +@dataclass +class UsageRecord: + """Single API usage record.""" + + key_id: str + task_id: str + domain: str = "" + timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat()) + tokens_used: int = 0 + + +class BillingTracker: + """Track API usage for billing and quota enforcement.""" + + def __init__(self, db=None): + self.db = db + self._usage: Dict[str, List[UsageRecord]] = {} + + def record_usage( + self, + key_id: str, + task_id: str, + domain: str = "", + tokens_used: int = 0, + ) -> None: + """Record an API usage event.""" + record = UsageRecord( + key_id=key_id, + task_id=task_id, + domain=domain, + tokens_used=tokens_used, + ) + if key_id not in self._usage: + self._usage[key_id] = [] + self._usage[key_id].append(record) + + def get_usage_count(self, key_id: str) -> int: + """Get total usage count for a key.""" + return len(self._usage.get(key_id, [])) + + def get_usage_summary(self, key_id: str) -> Dict: + """Get usage summary for a key.""" + records = self._usage.get(key_id, []) + domains: Dict[str, int] = {} + for r in records: + domains[r.domain] = domains.get(r.domain, 0) + 1 + + return { + "key_id": key_id, + "total_requests": len(records), + "total_tokens": sum(r.tokens_used for r in records), + "by_domain": domains, + } diff --git a/reasonforge/gateway/rate_limiter.py b/reasonforge/gateway/rate_limiter.py new file mode 100644 index 0000000..7d532fd --- /dev/null +++ b/reasonforge/gateway/rate_limiter.py @@ -0,0 +1,77 @@ +""" +ReasonForge - Rate Limiter + +Token-bucket rate limiting for the API gateway. 
+""" + +from __future__ import annotations + +import time +from collections import defaultdict +from typing import Dict, List + + +class TokenBucketRateLimiter: + """Token-bucket rate limiter for API requests.""" + + def __init__( + self, + rate: float = 10.0, # Tokens per second + capacity: int = 100, # Max burst size + ): + self.rate = rate + self.capacity = capacity + self._buckets: Dict[str, Dict] = defaultdict( + lambda: {"tokens": capacity, "last_time": time.time()} + ) + + def allow(self, key: str) -> bool: + """Check if a request is allowed under rate limits.""" + bucket = self._buckets[key] + now = time.time() + elapsed = now - bucket["last_time"] + + # Refill tokens + bucket["tokens"] = min( + self.capacity, + bucket["tokens"] + elapsed * self.rate, + ) + bucket["last_time"] = now + + if bucket["tokens"] >= 1.0: + bucket["tokens"] -= 1.0 + return True + + return False + + def get_retry_after(self, key: str) -> float: + """Get seconds until the next token is available.""" + bucket = self._buckets[key] + if bucket["tokens"] >= 1.0: + return 0.0 + deficit = 1.0 - bucket["tokens"] + return deficit / self.rate + + def reset(self, key: str) -> None: + """Reset rate limit for a key.""" + if key in self._buckets: + del self._buckets[key] + + +class PerIPRateLimiter: + """Per-IP address rate limiting.""" + + def __init__(self, requests_per_minute: int = 60): + self.rpm = requests_per_minute + self._requests: Dict[str, List[float]] = defaultdict(list) + + def allow(self, ip: str) -> bool: + now = time.time() + # Clean old entries + self._requests[ip] = [t for t in self._requests[ip] if now - t < 60] + + if len(self._requests[ip]) >= self.rpm: + return False + + self._requests[ip].append(now) + return True diff --git a/reasonforge/gateway/schemas.py b/reasonforge/gateway/schemas.py new file mode 100644 index 0000000..c20cfa6 --- /dev/null +++ b/reasonforge/gateway/schemas.py @@ -0,0 +1,84 @@ +""" +ReasonForge - Gateway Request/Response Schemas + +Pydantic models for 
the external API gateway. +""" + +from __future__ import annotations + +from typing import Dict, List, Optional + +from pydantic import BaseModel, Field + + +class TaskSubmissionRequest(BaseModel): + """Request to submit a reasoning task to the network.""" + + problem: str = Field(..., min_length=10, max_length=10000) + domain: Optional[str] = None + difficulty: Optional[int] = Field(None, ge=1, le=10) + timeout_seconds: Optional[int] = Field(300, ge=30, le=600) + callback_url: Optional[str] = None + + +class TaskResultResponse(BaseModel): + """Response for task result queries.""" + + task_id: str + status: str # "queued" | "processing" | "completed" | "failed" + result: Optional[Dict] = None + best_answer: Optional[str] = None + confidence: Optional[float] = None + reasoning_steps: Optional[List[Dict]] = None + processing_time_ms: Optional[int] = None + + +class LeaderboardEntry(BaseModel): + """Single entry in the miner leaderboard.""" + + uid: int + s_epoch: float + peb: float + rank: int + streak: int + tasks_completed: int + + +class LeaderboardResponse(BaseModel): + """Miner leaderboard response.""" + + epoch_id: int + entries: List[LeaderboardEntry] + total_miners: int + + +class NetworkStatsResponse(BaseModel): + """Network statistics response.""" + + current_epoch: int + total_tasks_processed: int + active_miners: int + active_validators: int + avg_cms: float + total_emission_tao: float + top_domains: Dict[str, int] + + +class HealthResponse(BaseModel): + """Health check response.""" + + status: str + version: str + uptime_seconds: float + epoch: int + db_connected: bool + + +class APIKeyInfo(BaseModel): + """API key information.""" + + key_id: str + owner: str + tier: str + requests_used: int + requests_limit: int diff --git a/reasonforge/miner/__init__.py b/reasonforge/miner/__init__.py new file mode 100644 index 0000000..510a0b6 --- /dev/null +++ b/reasonforge/miner/__init__.py @@ -0,0 +1,16 @@ +""" +ReasonForge - Miner Module + +Provides the miner-side 
reasoning engine, LLM backends, and domain routing. +""" + +from .domain_router import DomainRouter +from .reasoning import ReasoningEngine, ReasoningResult +from .reasoning import ReasoningStep as MinerReasoningStep + +__all__ = [ + "ReasoningEngine", + "ReasoningResult", + "MinerReasoningStep", + "DomainRouter", +] diff --git a/reasonforge/miner/backends/__init__.py b/reasonforge/miner/backends/__init__.py new file mode 100644 index 0000000..90b671e --- /dev/null +++ b/reasonforge/miner/backends/__init__.py @@ -0,0 +1,9 @@ +""" +ReasonForge - LLM Backend Adapters + +Pluggable backends for miner reasoning: OpenAI, Anthropic, local, agent. +""" + +from .base import LLMBackend + +__all__ = ["LLMBackend"] diff --git a/reasonforge/miner/backends/agent_backend.py b/reasonforge/miner/backends/agent_backend.py new file mode 100644 index 0000000..5f6d940 --- /dev/null +++ b/reasonforge/miner/backends/agent_backend.py @@ -0,0 +1,128 @@ +""" +ReasonForge - Agent LLM Backend + +Supports LangGraph/LangChain multi-agent reasoning pipelines. 
+""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +from typing import Any, Dict, List, Optional + +from .base import LLMBackend + +logger = logging.getLogger("reasonforge.miner.agent") + + +class AgentBackend(LLMBackend): + """LangGraph/LangChain agent backend for multi-step reasoning.""" + + def __init__( + self, + model: str = "gpt-4o", + api_key: Optional[str] = None, + max_iterations: int = 10, + ): + self.model = model + self.api_key = api_key or os.environ.get("OPENAI_API_KEY", "") + self.max_iterations = max_iterations + self._agent: Any = None + + def _build_agent(self): + """Build a LangGraph reasoning agent.""" + if self._agent is not None: + return + + try: + from langchain_openai import ChatOpenAI + from langgraph.prebuilt import create_react_agent + + llm = ChatOpenAI( + model=self.model, + api_key=self.api_key, + temperature=0.7, + ) + + # Create a basic ReAct agent with reasoning tools + self._agent = create_react_agent(llm, tools=[]) + logger.info("Built LangGraph agent with model: %s", self.model) + + except ImportError: + raise ImportError( + "langchain/langgraph not installed. 
" + "Install with: pip install langchain-openai langgraph" + ) + + async def generate( + self, + messages: List[Dict[str, str]], + temperature: float = 0.7, + max_tokens: int = 4096, + timeout: int = 300, + ) -> str: + self._build_agent() + + try: + # Convert messages to LangGraph format + from langchain_core.messages import HumanMessage, SystemMessage + + lc_messages = [] + for msg in messages: + if msg["role"] == "system": + lc_messages.append(SystemMessage(content=msg["content"])) + else: + lc_messages.append(HumanMessage(content=msg["content"])) + + result = await asyncio.wait_for( + self._agent.ainvoke( + {"messages": lc_messages}, + ), + timeout=timeout, + ) + + # Extract final message + if result and "messages" in result: + return result["messages"][-1].content + return "" + + except asyncio.TimeoutError: + logger.warning("Agent execution timed out after %ds", timeout) + return "" + except Exception as e: + logger.error("Agent execution failed: %s", e) + return "" + + async def generate_structured( + self, + messages: List[Dict[str, str]], + schema: Dict[str, Any], + timeout: int = 300, + ) -> Dict[str, Any]: + messages_copy = messages.copy() + messages_copy[-1] = { + **messages_copy[-1], + "content": ( + messages_copy[-1]["content"] + + "\n\nRespond with ONLY valid JSON matching this schema:\n" + + json.dumps(schema, indent=2) + ), + } + text = await self.generate(messages_copy, temperature=0.3, timeout=timeout) + try: + start = text.find("{") + end = text.rfind("}") + 1 + if start >= 0 and end > start: + return json.loads(text[start:end]) + return {} + except json.JSONDecodeError: + return {} + + async def health_check(self) -> bool: + try: + self._build_agent() + return True + except Exception: + return False diff --git a/reasonforge/miner/backends/anthropic_backend.py b/reasonforge/miner/backends/anthropic_backend.py new file mode 100644 index 0000000..e9201eb --- /dev/null +++ b/reasonforge/miner/backends/anthropic_backend.py @@ -0,0 +1,121 @@ +""" 
class AnthropicBackend(LLMBackend):
    """Anthropic Claude API backend for miner reasoning.

    The ``anthropic`` SDK is imported lazily, so constructing the backend does
    not require the package; the import happens on the first request.
    """

    def __init__(
        self,
        model: str = "claude-sonnet-4-20250514",
        api_key: Optional[str] = None,
    ):
        """Store the model name and resolve the API key.

        Args:
            model: Model identifier sent with every request.
            api_key: Explicit API key. Falls back to the ``ANTHROPIC_API_KEY``
                environment variable, then to an empty string.
        """
        self.model = model
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY", "")
        self._client = None

    def _get_client(self):
        """Return the cached ``AsyncAnthropic`` client, creating it on first use.

        Raises:
            ImportError: If the ``anthropic`` package is not installed.
        """
        if self._client is None:
            try:
                from anthropic import AsyncAnthropic
            except ImportError as err:
                raise ImportError(
                    "anthropic package not installed. Install with: pip install anthropic>=0.20.0"
                ) from err
            self._client = AsyncAnthropic(api_key=self.api_key)
        return self._client

    async def generate(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 4096,
        timeout: int = 300,
    ) -> str:
        """Generate a text completion.

        Args:
            messages: List of ``{role, content}`` dicts. Entries with role
                ``"system"`` are sent through the request's ``system`` field;
                several system messages are joined with blank lines instead of
                silently keeping only the last one.
            temperature: Sampling temperature.
            max_tokens: Maximum tokens to generate.
            timeout: Timeout in seconds for the whole request.

        Returns:
            The concatenated text of the response's text blocks, or ``""`` on
            timeout or any request error (errors are logged, not raised).
        """
        client = self._get_client()

        system_parts = [m["content"] for m in messages if m["role"] == "system"]
        chat_messages = [m for m in messages if m["role"] != "system"]
        system = "\n\n".join(part for part in system_parts if part)

        try:
            kwargs: Dict[str, Any] = {
                "model": self.model,
                "messages": chat_messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
            }
            if system:
                kwargs["system"] = system

            response = await asyncio.wait_for(
                client.messages.create(**kwargs),
                timeout=timeout,
            )
            # Not every content block carries text (e.g. tool-use blocks), so
            # collect the text-bearing ones rather than assuming block 0 is text.
            texts = [getattr(block, "text", None) for block in (response.content or [])]
            return "".join(text for text in texts if isinstance(text, str))
        except asyncio.TimeoutError:
            logger.warning("Anthropic request timed out after %ds", timeout)
            return ""
        except Exception as e:
            logger.error("Anthropic request failed: %s", e)
            return ""

    async def generate_structured(
        self,
        messages: List[Dict[str, str]],
        schema: Dict[str, Any],
        timeout: int = 300,
    ) -> Dict[str, Any]:
        """Generate a JSON object by appending the schema to the last message.

        The caller's ``messages`` list and its dicts are never modified.

        Args:
            messages: List of ``{role, content}`` dicts; must be non-empty to
                produce a result.
            schema: JSON schema embedded in the prompt.
            timeout: Timeout in seconds.

        Returns:
            The parsed JSON object, or ``{}`` when there are no messages, the
            model returns no JSON object, or the JSON does not parse.
        """
        if not messages:
            logger.warning("Anthropic structured request skipped: no messages")
            return {}

        *earlier, last = messages
        prompt_messages = [
            *earlier,
            {
                **last,
                "content": (
                    last["content"]
                    + "\n\nRespond with ONLY valid JSON matching this schema:\n"
                    + json.dumps(schema, indent=2)
                ),
            },
        ]

        text = await self.generate(prompt_messages, temperature=0.3, timeout=timeout)

        # The model may wrap the JSON in prose; keep the outermost {...} span.
        start = text.find("{")
        end = text.rfind("}") + 1
        if start < 0 or end <= start:
            return {}
        try:
            return json.loads(text[start:end])
        except json.JSONDecodeError:
            logger.warning("Failed to parse Anthropic response as JSON")
            return {}

    async def health_check(self) -> bool:
        """Return True when a minimal ``ping`` request yields any text."""
        try:
            result = await self.generate(
                [{"role": "user", "content": "ping"}],
                max_tokens=5,
                timeout=10,
            )
            return bool(result)
        except Exception:
            return False
class LocalBackend(LLMBackend):
    """Local transformers/vLLM backend for miner reasoning.

    The model is loaded lazily on the first request (or health check) and then
    reused. Generation runs in the default executor so the event loop is not
    blocked while the model is producing tokens.
    """

    def __init__(
        self,
        model: str = "meta-llama/Meta-Llama-3-8B-Instruct",
        device: str = "auto",
        use_vllm: bool = False,
    ):
        """Record the configuration; nothing is loaded here.

        Args:
            model: Model name or path handed to transformers / vLLM.
            device: Value passed as ``device_map`` to the transformers pipeline.
            use_vllm: Use vLLM instead of a transformers pipeline.
        """
        self.model_name = model
        self.device = device
        self.use_vllm = use_vllm
        self._pipeline: Any = None
        self._vllm_model: Any = None

    def _load_model(self):
        """Lazy-load the model for the configured engine."""
        if self.use_vllm:
            self._load_vllm()
        else:
            self._load_transformers()

    def _load_transformers(self):
        """Create the transformers text-generation pipeline once.

        Raises:
            ImportError: If ``transformers`` or ``torch`` is not installed.
        """
        if self._pipeline is not None:
            return
        # Only the imports are guarded, so an ImportError raised while the
        # pipeline itself is built keeps its own message.
        try:
            import torch
            from transformers import pipeline
        except ImportError as err:
            raise ImportError(
                "transformers package not installed. Install with: pip install transformers torch"
            ) from err

        self._pipeline = pipeline(
            "text-generation",
            model=self.model_name,
            device_map=self.device,
            torch_dtype=torch.float16,
        )
        logger.info("Loaded transformers model: %s", self.model_name)

    def _load_vllm(self):
        """Create the vLLM engine once.

        Raises:
            ImportError: If ``vllm`` is not installed.
        """
        if self._vllm_model is not None:
            return
        try:
            from vllm import LLM
        except ImportError as err:
            raise ImportError(
                "vllm package not installed. Install with: pip install vllm"
            ) from err

        self._vllm_model = LLM(model=self.model_name)
        logger.info("Loaded vLLM model: %s", self.model_name)

    def _format_messages(self, messages: List[Dict[str, str]]) -> str:
        """Format chat messages into a prompt string.

        Messages whose role is not system/user/assistant are skipped. The
        prompt always ends with an open assistant turn.
        """
        tags = {"system": "<|system|>", "user": "<|user|>", "assistant": "<|assistant|>"}
        parts = [
            f"{tags[msg['role']]}\n{msg['content']}\n"
            for msg in messages
            if msg["role"] in tags
        ]
        parts.append("<|assistant|>\n")
        return "".join(parts)

    async def generate(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: int = 4096,
        timeout: int = 300,
    ) -> str:
        """Generate a completion with the local model.

        Args:
            messages: List of ``{role, content}`` dicts.
            temperature: Sampling temperature. A value of 0 or below selects
                greedy decoding on the transformers path.
            max_tokens: Maximum new tokens to generate.
            timeout: Seconds to wait for the result.

        Returns:
            The generated text, or ``""`` on timeout or generation error.

        Raises:
            ImportError: If the inference package for the chosen engine is
                missing (raised while loading the model).
        """
        self._load_model()
        prompt = self._format_messages(messages)

        # We are inside a coroutine, so there is always a running loop.
        loop = asyncio.get_running_loop()

        try:
            if self.use_vllm:
                from vllm import SamplingParams

                params = SamplingParams(
                    temperature=temperature,
                    max_tokens=max_tokens,
                )
                outputs = await asyncio.wait_for(
                    loop.run_in_executor(
                        None, lambda: self._vllm_model.generate([prompt], params)
                    ),
                    timeout=timeout,
                )
                return outputs[0].outputs[0].text

            # transformers rejects temperature <= 0 when sampling, so fall back
            # to greedy decoding instead of failing the whole request.
            sampling = temperature > 0
            gen_kwargs: Dict[str, Any] = {
                "max_new_tokens": max_tokens,
                "do_sample": sampling,
                "return_full_text": False,
            }
            if sampling:
                gen_kwargs["temperature"] = temperature

            outputs = await asyncio.wait_for(
                loop.run_in_executor(None, lambda: self._pipeline(prompt, **gen_kwargs)),
                timeout=timeout,
            )
            return outputs[0]["generated_text"]
        except asyncio.TimeoutError:
            # NOTE: the worker thread is not interrupted; only the wait ends.
            logger.warning("Local generation timed out after %ds", timeout)
            return ""
        except Exception as e:
            logger.error("Local generation failed: %s", e)
            return ""

    async def generate_structured(
        self,
        messages: List[Dict[str, str]],
        schema: Dict[str, Any],
        timeout: int = 300,
    ) -> Dict[str, Any]:
        """Generate a JSON object by appending the schema to the last message.

        The caller's ``messages`` list and its dicts are never modified.

        Returns:
            The parsed JSON object, or ``{}`` when there are no messages, no
            JSON object is found, or the JSON does not parse.
        """
        if not messages:
            logger.warning("Local structured request skipped: no messages")
            return {}

        *earlier, last = messages
        prompt_messages = [
            *earlier,
            {
                **last,
                "content": (
                    last["content"]
                    + "\n\nRespond with ONLY valid JSON:\n"
                    + json.dumps(schema, indent=2)
                ),
            },
        ]
        text = await self.generate(prompt_messages, temperature=0.3, timeout=timeout)

        # Keep the outermost {...} span; the model may add surrounding prose.
        start = text.find("{")
        end = text.rfind("}") + 1
        if start < 0 or end <= start:
            return {}
        try:
            return json.loads(text[start:end])
        except json.JSONDecodeError:
            return {}

    async def health_check(self) -> bool:
        """Return True when the model loads (or is already loaded)."""
        try:
            self._load_model()
            return True
        except Exception:
            return False
async def generate_structured(
    self,
    messages: List[Dict[str, str]],
    schema: Dict[str, Any],
    timeout: int = 300,
) -> Dict[str, Any]:
    """Generate a JSON object using the API's JSON response mode.

    Method of ``OpenAIBackend``. The schema is appended to a *copy* of the
    last message; the caller's list and dicts are left untouched, so the same
    ``messages`` can be reused for a follow-up request.

    Args:
        messages: List of ``{role, content}`` dicts; must be non-empty to
            produce a result.
        schema: JSON schema embedded in the prompt.
        timeout: Timeout in seconds for the request.

    Returns:
        The parsed JSON object, or ``{}`` on empty input, timeout, invalid
        JSON, a non-object JSON value, or any request error.
    """
    client = self._get_client()
    if not messages:
        logger.warning("OpenAI structured request skipped: no messages")
        return {}

    try:
        # Build a new last message instead of `+=` on the shared dict:
        # list.copy() is shallow, so in-place edits would leak to the caller.
        *earlier, last = messages
        messages_with_json = [
            *earlier,
            {
                **last,
                "content": last["content"]
                + "\n\nRespond with valid JSON matching this schema:\n"
                + json.dumps(schema, indent=2),
            },
        ]

        response = await asyncio.wait_for(
            client.chat.completions.create(
                model=self.model,
                messages=messages_with_json,
                temperature=0.3,
                max_tokens=4096,
                response_format={"type": "json_object"},
            ),
            timeout=timeout,
        )
        content = response.choices[0].message.content or "{}"
        parsed = json.loads(content)
    except asyncio.TimeoutError:
        logger.warning("OpenAI structured request timed out")
        return {}
    except json.JSONDecodeError:
        logger.warning("Failed to parse structured response as JSON")
        return {}
    except Exception as e:
        logger.error("OpenAI structured request failed: %s", e)
        return {}

    return parsed if isinstance(parsed, dict) else {}
+ return False diff --git a/reasonforge/miner/domain_router.py b/reasonforge/miner/domain_router.py new file mode 100644 index 0000000..13ec049 --- /dev/null +++ b/reasonforge/miner/domain_router.py @@ -0,0 +1,160 @@ +""" +ReasonForge - Domain Router + +Maps reasoning domains to specialized system prompts and output parsers. +""" + +from __future__ import annotations + +from typing import Dict + +from ..types import Domain + +# ────────────────────────────────────────────── +# Domain-Specific System Prompts +# ────────────────────────────────────────────── + +DOMAIN_PROMPTS: Dict[str, str] = { + Domain.MATHEMATICS: """You are a mathematical reasoning engine. For each step in your solution: +1. State your approach clearly +2. Show all formal work with explicit derivations +3. Verify each step's correctness +4. If possible, express proofs in Lean 4 syntax fragments + +Structure your response as a series of numbered reasoning steps, each with: +- reasoning: Your detailed work for this step +- evidence: Mathematical justification or references +- confidence: Your confidence in this step (0.0 to 1.0) +- formal_proof_fragment: Optional Lean 4 proof fragment + +Conclude with a clear final answer.""", + Domain.CODE: """You are a code reasoning engine. For each step: +1. Analyze the requirements and constraints +2. Design the solution approach with complexity analysis +3. Implement with clean, well-documented code +4. Include test cases that verify correctness + +Structure your response as reasoning steps covering: +- Problem analysis and edge cases +- Algorithm design with time/space complexity +- Implementation with inline comments +- Test cases and verification + +Your code_artifact should contain the complete, runnable solution.""", + Domain.SCIENTIFIC: """You are a scientific reasoning engine. For each step: +1. Formulate hypotheses based on known principles +2. Design the analytical approach or simulation +3. Execute calculations with proper units +4. 
Validate results against known benchmarks or constraints + +Structure your response with steps covering: +- Problem formulation and relevant theory +- Methodology and approach +- Calculations and results +- Validation and uncertainty analysis""", + Domain.STRATEGIC: """You are a strategic reasoning engine specializing in game theory and optimization. For each step: +1. Model the problem formally (players, strategies, payoffs) +2. Identify equilibrium concepts or optimization framework +3. Solve using appropriate methods (LP, Nash, etc.) +4. Verify the solution and analyze sensitivity + +Structure your response with steps covering: +- Problem formalization +- Solution methodology +- Detailed computation +- Solution verification and interpretation""", + Domain.CAUSAL: """You are a causal reasoning engine. For each step: +1. Construct or analyze the causal DAG +2. Identify confounders, mediators, and instruments +3. Apply do-calculus or appropriate identification strategy +4. Derive the causal estimand + +Structure your response with steps covering: +- Causal graph specification +- Identification strategy (backdoor, frontdoor, IV) +- Formal derivation +- Interpretation and assumptions""", + Domain.ETHICAL: """You are an ethical reasoning engine. For each step: +1. Identify the key moral dimensions and stakeholders +2. Apply multiple ethical frameworks (utilitarian, deontological, virtue ethics, etc.) +3. Analyze tensions and trade-offs between perspectives +4. 
Synthesize a nuanced conclusion + +Structure your response with steps covering: +- Stakeholder and issue analysis +- Framework application (minimum 3 frameworks) +- Comparative analysis of perspectives +- Balanced conclusion with justified reasoning""", +} + +# Output schema for structured reasoning +REASONING_SCHEMA = { + "type": "object", + "properties": { + "steps": { + "type": "array", + "items": { + "type": "object", + "properties": { + "step_id": {"type": "integer"}, + "reasoning": {"type": "string"}, + "evidence": {"type": "string"}, + "confidence": {"type": "number", "minimum": 0, "maximum": 1}, + "formal_proof_fragment": {"type": "string"}, + }, + "required": ["step_id", "reasoning", "confidence"], + }, + }, + "final_answer": {"type": "string"}, + "proof_status": {"type": "string", "enum": ["VERIFIED", "FAILED", "NONE"]}, + }, + "required": ["steps", "final_answer"], +} + + +class DomainRouter: + """Routes tasks to domain-specialized prompts and parsers.""" + + def __init__(self, supported_domains: list[str] | None = None): + self.supported_domains = supported_domains or list(DOMAIN_PROMPTS.keys()) + + def get_system_prompt(self, domain: str) -> str: + """Get the system prompt for a domain.""" + # Handle both Domain enum and string + domain_key = domain if isinstance(domain, str) else domain.value + for key, prompt in DOMAIN_PROMPTS.items(): + key_val = key.value if hasattr(key, "value") else key + if key_val == domain_key: + return prompt + # Default fallback + return DOMAIN_PROMPTS[Domain.MATHEMATICS] + + def build_prompt( + self, + problem: str, + domain: str, + difficulty: int, + context: str | None = None, + constraints: str | None = None, + ) -> str: + """Build the full user prompt for a task.""" + parts = [f"**Problem (Difficulty {difficulty}/10):**\n{problem}"] + if context: + parts.append(f"\n**Context:**\n{context}") + if constraints: + parts.append(f"\n**Constraints:**\n{constraints}") + parts.append( + "\n\nProvide your solution as a structured 
@dataclass
class ProofResult:
    """Result of a proof generation attempt."""

    status: str = "NONE"  # "VERIFIED" | "FAILED" | "NONE"
    artifact: Optional[str] = None  # Base64-encoded proof file
    fragments: Optional[list[str]] = None  # Per-step proof fragments

    def __post_init__(self):
        # Normalise the "not provided" case to an empty list.
        if self.fragments is None:
            self.fragments = []


class ProofGenerator:
    """Generates formal proof fragments from reasoning steps."""

    # Patterns that suggest formal proof content
    LEAN4_PATTERNS = [
        r"theorem\s+\w+",
        r"lemma\s+\w+",
        r"def\s+\w+",
        r"by\s+(simp|ring|omega|linarith|norm_num|decide)",
        r"#check\s+",
        r"example\s*:",
    ]

    # Fenced block with an optional lean/lean4/proof tag. Trailing spaces and
    # CRLF after the tag are tolerated so such blocks are not silently missed.
    _FENCE_RE = re.compile(r"```(?:lean4?|proof)?[ \t]*\r?\n(.*?)```", re.DOTALL)

    def _looks_like_lean(self, text: str) -> bool:
        """Return True when ``text`` matches any of ``LEAN4_PATTERNS``."""
        return any(re.search(pattern, text) for pattern in self.LEAN4_PATTERNS)

    def extract_proof_fragments(self, reasoning_text: str) -> list[str]:
        """Extract any formal proof fragments from reasoning text.

        Args:
            reasoning_text: Free-form text that may contain fenced code blocks.

        Returns:
            The stripped content of every fenced block that looks like Lean 4,
            in order of appearance.
        """
        return [
            block.strip()
            for block in self._FENCE_RE.findall(reasoning_text)
            if self._looks_like_lean(block)
        ]

    def assess_proof_status(self, fragments: list[str], final_answer: str) -> str:
        """Assess the proof status based on available fragments.

        Returns ``"VERIFIED"`` when at least two fragments exist, otherwise
        ``"NONE"``. ``final_answer`` is currently not consulted.
        """
        if not fragments:
            return "NONE"
        # If we have fragments, we mark as potentially verifiable
        # Actual verification happens on the validator side via Lean4Checker
        return "VERIFIED" if len(fragments) >= 2 else "NONE"

    def generate(self, reasoning_steps: list[dict], final_answer: str) -> ProofResult:
        """Generate proof result from reasoning steps.

        For each step, a Lean-looking ``formal_proof_fragment`` already present
        on the step is kept and counted; fragments found in fenced blocks of
        the step's ``reasoning`` are added after it. Previously a supplied
        fragment was ignored for status/artifact and overwritten by the first
        extracted one.

        Args:
            reasoning_steps: Step dicts. Updated in place: a step with at
                least one fragment gets ``formal_proof_fragment`` set to its
                first fragment.
            final_answer: Passed through to ``assess_proof_status``.

        Returns:
            A ``ProofResult`` with the status, the base64 of all fragments
            joined by blank lines (``None`` when there are none), and the
            fragment list.
        """
        all_fragments: list[str] = []
        for step in reasoning_steps:
            step_fragments: list[str] = []

            supplied = step.get("formal_proof_fragment")
            if isinstance(supplied, str) and supplied.strip() and self._looks_like_lean(supplied):
                step_fragments.append(supplied.strip())

            for fragment in self.extract_proof_fragments(step.get("reasoning") or ""):
                # Skip a fenced copy of a fragment the step already carries.
                if fragment not in step_fragments:
                    step_fragments.append(fragment)

            all_fragments.extend(step_fragments)
            if step_fragments:
                step["formal_proof_fragment"] = step_fragments[0]

        status = self.assess_proof_status(all_fragments, final_answer)

        artifact = None
        if all_fragments:
            import base64

            combined = "\n\n".join(all_fragments)
            artifact = base64.b64encode(combined.encode()).decode()

        return ProofResult(
            status=status,
            artifact=artifact,
            fragments=all_fragments,
        )
@dataclass
class ReasoningStep:
    """A single step in the reasoning chain."""

    step_id: int = 0
    reasoning: str = ""
    evidence: str = ""
    confidence: float = 0.0
    formal_proof_fragment: Optional[str] = None


@dataclass
class ReasoningResult:
    """Complete result of a reasoning task."""

    steps: List[ReasoningStep] = field(default_factory=list)
    final_answer: str = ""
    proof_status: Optional[str] = None
    proof_artifact: Optional[str] = None
    code_artifact: Optional[str] = None
    time_taken_ms: int = 0


class ReasoningEngine:
    """
    Orchestrates multi-step reasoning:
    1. Build domain-specific system prompt
    2. Request chain-of-thought from LLM
    3. Parse structured reasoning steps
    4. Attempt formal proof generation (math/code domains)
    5. Return structured result
    """

    def __init__(
        self,
        backend: str = "openai",
        model: str = "gpt-4o",
        domains: list[str] | None = None,
        api_key: str | None = None,
    ):
        """Wire up the domain router, proof generator and LLM backend.

        Args:
            backend: Backend type: "openai", "anthropic", "local" or "agent".
            model: Model identifier forwarded to the backend.
            domains: Supported domains forwarded to ``DomainRouter``.
            api_key: API key forwarded to backends that accept one.

        Raises:
            ValueError: If ``backend`` is not a known backend type.
        """
        self.domain_router = DomainRouter(domains)
        self.proof_generator = ProofGenerator()
        self.backend = self._create_backend(backend, model, api_key)

    def _create_backend(
        self, backend_type: str, model: str, api_key: str | None = None
    ) -> LLMBackend:
        """Create the appropriate LLM backend.

        Backend modules are imported only when selected, so unused backends
        (and their optional dependencies) are never loaded.
        """
        if backend_type == "openai":
            from .backends.openai_backend import OpenAIBackend

            return OpenAIBackend(model=model, api_key=api_key)
        elif backend_type == "anthropic":
            from .backends.anthropic_backend import AnthropicBackend

            return AnthropicBackend(model=model, api_key=api_key)
        elif backend_type == "local":
            from .backends.local_backend import LocalBackend

            return LocalBackend(model=model)
        elif backend_type == "agent":
            from .backends.agent_backend import AgentBackend

            return AgentBackend(model=model, api_key=api_key)
        else:
            raise ValueError(f"Unknown backend type: {backend_type}")

    @staticmethod
    def _coerce_confidence(value: object, default: float = 0.5) -> float:
        """Turn a model-supplied confidence into a float within [0, 1].

        Non-numeric values and NaN yield ``default`` instead of raising.
        """
        try:
            number = float(value)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            return default
        if number != number:  # NaN is the only value unequal to itself
            return default
        return min(1.0, max(0.0, number))

    def _steps_from_structured(self, raw_steps: object) -> List[ReasoningStep]:
        """Convert the ``steps`` value of a structured response into steps.

        Anything that is not a list yields no steps; list entries that are not
        dicts are skipped. A missing or unusable ``step_id`` falls back to the
        entry's index in the list.
        """
        if not isinstance(raw_steps, list):
            return []

        steps: List[ReasoningStep] = []
        for index, data in enumerate(raw_steps):
            if not isinstance(data, dict):
                continue
            try:
                step_id = int(data.get("step_id", index))
            except (TypeError, ValueError):
                step_id = index
            fragment = data.get("formal_proof_fragment")
            steps.append(
                ReasoningStep(
                    step_id=step_id,
                    reasoning=str(data.get("reasoning") or ""),
                    evidence=str(data.get("evidence") or ""),
                    confidence=self._coerce_confidence(data.get("confidence", 0.5)),
                    formal_proof_fragment=(
                        fragment if isinstance(fragment, str) and fragment else None
                    ),
                )
            )
        return steps

    async def solve(
        self,
        problem: str,
        domain: str,
        difficulty: int = 5,
        context: str | None = None,
        constraints: str | None = None,
        timeout: int = 300,
    ) -> ReasoningResult:
        """Execute multi-step reasoning for a task.

        Structured generation is tried first. If it yields neither steps nor a
        final answer, a free-form completion is requested with whatever is
        left of ``timeout`` (at least one second), so the two calls together
        stay close to the requested budget.

        Args:
            problem: Problem statement.
            domain: Domain name or enum member (its ``value`` is used).
            difficulty: Difficulty shown in the prompt.
            context: Optional context section.
            constraints: Optional constraints section.
            timeout: Overall time budget in seconds.

        Returns:
            The parsed steps, final answer, proof status/artifact, optional
            code artifact (domain "code" only) and elapsed milliseconds.
        """
        started_ns = time.monotonic_ns()
        deadline = time.monotonic() + timeout

        # 1. Build prompts
        system_prompt = self.domain_router.get_system_prompt(domain)
        user_prompt = self.domain_router.build_prompt(
            problem, domain, difficulty, context, constraints
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        # 2. Try structured generation first
        schema = self.domain_router.get_schema()
        parsed = await self.backend.generate_structured(messages, schema, timeout=timeout)

        # 3. Parse response (model output is untrusted: validate every field)
        steps: List[ReasoningStep] = []
        final_answer = ""
        if isinstance(parsed, dict) and "steps" in parsed:
            steps = self._steps_from_structured(parsed["steps"])
            answer = parsed.get("final_answer", "")
            final_answer = "" if answer is None else str(answer)

        if not steps and not final_answer:
            # Fallback: generate free-form and parse
            remaining = max(1, int(deadline - time.monotonic()))
            raw_text = await self.backend.generate(messages, timeout=remaining)
            steps, final_answer = self._parse_freeform(raw_text)

        # 4. Extract code artifact if applicable
        code_artifact = None
        domain_val = domain.value if hasattr(domain, "value") else domain
        if domain_val == "code":
            code_artifact = self._extract_code_artifact(
                [s.reasoning for s in steps] + [final_answer]
            )

        # 5. Attempt proof generation (the generator may update these dicts)
        steps_dicts = [
            {
                "step_id": s.step_id,
                "reasoning": s.reasoning,
                "evidence": s.evidence,
                "confidence": s.confidence,
                "formal_proof_fragment": s.formal_proof_fragment,
            }
            for s in steps
        ]
        proof_result = self.proof_generator.generate(steps_dicts, final_answer)

        # Copy any proof fragments the generator attached back onto the steps
        for step, step_dict in zip(steps, steps_dicts):
            fragment = step_dict.get("formal_proof_fragment")
            if fragment:
                step.formal_proof_fragment = str(fragment)

        elapsed_ms = (time.monotonic_ns() - started_ns) // 1_000_000

        return ReasoningResult(
            steps=steps,
            final_answer=final_answer,
            proof_status=proof_result.status,
            proof_artifact=proof_result.artifact,
            code_artifact=code_artifact,
            time_taken_ms=elapsed_ms,
        )

    @staticmethod
    def _starts_new_step(line: str) -> bool:
        """Return True for lines like "1. ..." or "Step 2: ..."."""
        stripped = line.strip()
        if not stripped:
            return False
        numbered = stripped[0].isdigit() and "." in stripped[:4]
        return numbered or stripped.lower().startswith("step ")

    def _parse_freeform(self, text: str) -> tuple[list[ReasoningStep], str]:
        """Parse free-form LLM output into structured steps.

        The text is split at step-boundary lines. The last chunk is always the
        final answer; it is also recorded as a step unless it contains an
        answer marker and earlier steps exist.

        Returns:
            ``(steps, final_answer)``; ``([], "")`` for empty or
            whitespace-only text.
        """
        if not text or not text.strip():
            return [], ""

        steps: list[ReasoningStep] = []
        current: list[str] = []
        for line in text.strip().split("\n"):
            if current and self._starts_new_step(line):
                steps.append(
                    ReasoningStep(
                        step_id=len(steps),
                        reasoning="\n".join(current),
                        confidence=0.5,
                    )
                )
                current = []
            current.append(line)

        # `current` holds at least the last line, so a tail always exists.
        tail = "\n".join(current)
        lowered = tail.lower()
        has_answer_marker = any(
            marker in lowered
            for marker in ["final answer", "therefore", "conclusion", "answer:"]
        )
        if not has_answer_marker or not steps:
            steps.append(
                ReasoningStep(step_id=len(steps), reasoning=tail, confidence=0.5)
            )
        return steps, tail

    def _extract_code_artifact(self, texts: list[str]) -> str | None:
        """Extract code blocks from reasoning text.

        Returns the base64 of the longest fenced code block found across all
        ``texts`` (the first one on ties), or ``None`` when there is none.
        """
        import base64
        import re

        fence = re.compile(
            r"```(?:python|javascript|java|cpp|c\+\+|rust|go)?\n(.*?)```", re.DOTALL
        )
        blocks = [block for text in texts if text for block in fence.findall(text)]
        if not blocks:
            return None
        # Prefer the longest block overall: an early one-line snippet should
        # not shadow the complete solution in a later step or the answer.
        longest = max(blocks, key=len)
        return base64.b64encode(longest.strip().encode()).decode()
+""" + +from .health import HealthChecker +from .logger import setup_logging +from .metrics import MetricsCollector + +__all__ = ["MetricsCollector", "setup_logging", "HealthChecker"] diff --git a/reasonforge/monitoring/health.py b/reasonforge/monitoring/health.py new file mode 100644 index 0000000..8e0138e --- /dev/null +++ b/reasonforge/monitoring/health.py @@ -0,0 +1,86 @@ +""" +ReasonForge - Health Check + +Health check utilities for neurons and services. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass, field +from typing import Callable, Dict, List + +logger = logging.getLogger("reasonforge.monitoring.health") + + +@dataclass +class HealthStatus: + """Health status of a neuron or service.""" + + healthy: bool = True + version: str = "0.1.0" + uptime_seconds: float = 0.0 + checks: Dict[str, bool] = field(default_factory=dict) + errors: List[str] = field(default_factory=list) + + +class HealthChecker: + """Perform health checks on neuron components.""" + + def __init__(self): + self._start_time = time.time() + self._checks: Dict[str, Callable[[], bool]] = {} + + def register_check(self, name: str, check_fn: Callable[[], bool]) -> None: + """Register a health check function. 
# Optional dependency: structured JSON logging is used only when available.
try:
    import structlog

    HAS_STRUCTLOG = True
except ImportError:
    HAS_STRUCTLOG = False


def setup_logging(
    neuron_type: str = "validator",
    uid: int = 0,
    debug: bool = False,
    log_file: Optional[str] = None,
) -> logging.Logger:
    """
    Configure structured JSON logging.

    Uses structlog if available, otherwise falls back to standard logging.
    In both modes the stdlib root logger gets a stdout handler and, when
    ``log_file`` is given, a file handler. Previously ``log_file`` was
    ignored whenever structlog was installed.

    Args:
        neuron_type: Neuron kind; bound to every structlog event, or used in
            the stdlib logger name ``reasonforge.<neuron_type>``.
        uid: Neuron UID; bound to every structlog event, or logged once in
            the fallback mode.
        debug: Use DEBUG level instead of INFO.
        log_file: Optional path of a file that also receives the log output.

    Returns:
        A structlog bound logger when structlog is installed, otherwise a
        stdlib ``logging.Logger``.
    """
    level = logging.DEBUG if debug else logging.INFO

    # structlog renders the complete JSON line itself, so the stdlib layer
    # must print the message verbatim in that mode.
    line_format = (
        "%(message)s"
        if HAS_STRUCTLOG
        else "%(asctime)s | %(name)s | %(levelname)s | %(message)s"
    )
    formatter = logging.Formatter(line_format)

    handlers: list = [logging.StreamHandler(sys.stdout)]
    if log_file:
        handlers.append(logging.FileHandler(log_file))
    for handler in handlers:
        handler.setFormatter(formatter)
        handler.setLevel(level)

    # NOTE: basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(
        level=level,
        handlers=handlers,
    )

    if HAS_STRUCTLOG:
        structlog.configure(
            processors=[
                structlog.contextvars.merge_contextvars,
                structlog.stdlib.add_log_level,
                structlog.stdlib.add_logger_name,
                structlog.processors.TimeStamper(fmt="iso"),
                structlog.processors.StackInfoRenderer(),
                structlog.processors.format_exc_info,
                structlog.processors.JSONRenderer(),
            ],
            context_class=dict,
            logger_factory=structlog.stdlib.LoggerFactory(),
            wrapper_class=structlog.stdlib.BoundLogger,
            cache_logger_on_first_use=True,
        )

        log = structlog.get_logger()
        log = log.bind(neuron_type=neuron_type, uid=uid)
        return log

    # Fallback: standard logging
    log = logging.getLogger(f"reasonforge.{neuron_type}")
    log.info("Logging configured (neuron=%s, uid=%d)", neuron_type, uid)
    return log
class MetricsCollector:
    """Prometheus metrics for subnet monitoring.

    Every ``record_*`` method is a no-op when ``prometheus_client`` is not
    installed, so callers never need to guard their calls.
    """

    # Metric objects per name prefix, shared by all instances on purpose:
    # prometheus_client registers metrics in a process-wide registry and
    # raises ValueError when the same name is registered a second time.
    _metric_cache: dict = {}

    def __init__(self, neuron_type: str = "validator", uid: int = 0):
        """Create the metrics for ``neuron_type``, or reuse existing ones.

        Args:
            neuron_type: Used in the metric name prefix
                ``reasonforge_<neuron_type>``.
            uid: Neuron UID; used for the default metrics port.
        """
        self.neuron_type = neuron_type
        self.uid = uid
        self._initialized = False

        if not HAS_PROMETHEUS:
            logger.debug("prometheus_client not available, metrics disabled")
            return

        prefix = f"reasonforge_{neuron_type}"
        metrics = self._metric_cache.get(prefix)
        if metrics is None:
            metrics = self._build_metrics(prefix)
            self._metric_cache[prefix] = metrics

        # Expose the metrics under the same attribute names as before
        # (tasks_processed, task_latency, current_epoch, ...).
        for attribute, metric in metrics.items():
            setattr(self, attribute, metric)

        self._initialized = True

    @staticmethod
    def _build_metrics(prefix: str) -> dict:
        """Register all metrics for ``prefix`` and return them by attribute name."""
        return {
            # Counters
            "tasks_processed": Counter(
                f"{prefix}_tasks_total",
                "Total tasks processed",
                ["domain", "difficulty"],
            ),
            "epochs_completed": Counter(
                f"{prefix}_epochs_total",
                "Epochs completed",
            ),
            "traps_injected": Counter(
                f"{prefix}_traps_total",
                "Trap problems injected",
            ),
            "breakthroughs": Counter(
                f"{prefix}_breakthroughs_total",
                "Breakthrough solutions",
            ),
            "plagiarism_detected": Counter(
                f"{prefix}_plagiarism_total",
                "Plagiarism detections",
            ),
            "weight_set_failures": Counter(
                f"{prefix}_weight_failures_total",
                "Weight setting failures",
            ),
            # Histograms
            "task_latency": Histogram(
                f"{prefix}_task_latency_seconds",
                "Task processing time",
                ["domain"],
            ),
            "cms_distribution": Histogram(
                f"{prefix}_cms_score",
                "CMS score distribution",
                # i / 10 gives clean bounds; 0.1 * i yields values such as
                # 0.30000000000000004, which then show up as bucket labels.
                buckets=[i / 10 for i in range(11)],
            ),
            # Gauges
            "current_epoch": Gauge(
                f"{prefix}_current_epoch",
                "Current epoch number",
            ),
            "active_miners": Gauge(
                f"{prefix}_active_miners",
                "Number of active miners",
            ),
            "avg_cms": Gauge(
                f"{prefix}_avg_cms",
                "Average CMS this epoch",
            ),
            "total_emission": Gauge(
                f"{prefix}_total_emission_tao",
                "Total TAO emitted",
            ),
            "top_miner_score": Gauge(
                f"{prefix}_top_miner_score",
                "Highest S_epoch",
            ),
        }

    def start_server(self, port: Optional[int] = None) -> None:
        """Start Prometheus metrics HTTP server.

        Args:
            port: Port to listen on; defaults to ``9090 + uid``. Failures to
                bind are logged as warnings, not raised.
        """
        if not HAS_PROMETHEUS or not self._initialized:
            return
        port = port or (9090 + self.uid)
        try:
            start_http_server(port)
            logger.info("Metrics server started on port %d", port)
        except Exception as e:
            logger.warning("Failed to start metrics server: %s", e)

    def record_task(self, domain: str, difficulty: int, latency_s: float) -> None:
        """Count one processed task and observe its latency in seconds."""
        if not self._initialized:
            return
        self.tasks_processed.labels(domain=domain, difficulty=str(difficulty)).inc()
        self.task_latency.labels(domain=domain).observe(latency_s)

    def record_epoch(
        self, epoch_id: int, n_miners: int, avg_score: float, top_score: float
    ) -> None:
        """Count one completed epoch and update the per-epoch gauges."""
        if not self._initialized:
            return
        self.epochs_completed.inc()
        self.current_epoch.set(epoch_id)
        self.active_miners.set(n_miners)
        self.avg_cms.set(avg_score)
        self.top_miner_score.set(top_score)

    def record_cms(self, score: float) -> None:
        """Observe one CMS score in the distribution histogram."""
        if not self._initialized:
            return
        self.cms_distribution.observe(score)

    def record_plagiarism(self) -> None:
        """Count one plagiarism detection."""
        if not self._initialized:
            return
        self.plagiarism_detected.inc()

    def record_weight_failure(self) -> None:
        """Count one weight-setting failure."""
        if not self._initialized:
            return
        self.weight_set_failures.inc()
class _SynapseShim(BaseModel):
    """Minimal Synapse shim for environments without bittensor.

    Provides the small part of the ``bt.Synapse`` surface that this module
    uses (``dendrite``/``axon`` metadata, ``deserialize``, ``to_headers``,
    ``body_hash``) so the protocol classes can be built and tested when
    bittensor cannot be imported.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is
    # deprecated in v2, and this class already requires v2 (``model_dump``).
    model_config = {"arbitrary_types_allowed": True}

    dendrite: Optional[_DendriteMetadata] = None
    axon: Optional[_AxonMetadata] = None

    def deserialize(self) -> dict:
        """Return every field of the model as a plain dict."""
        return self.model_dump()

    def to_headers(self) -> dict:
        """Return transport headers; the shim has none."""
        return {}

    def body_hash(self) -> str:
        """Return the SHA-256 hex digest of the hashed fields.

        Only fields named in ``required_hash_fields`` (empty when the
        subclass does not define it) are included; they are serialized as
        JSON with sorted keys so the digest is deterministic.
        """
        hash_fields = getattr(self, "required_hash_fields", [])
        selected = {k: v for k, v in self.model_dump().items() if k in hash_fields}
        data = json.dumps(selected, sort_keys=True)
        return hashlib.sha256(data.encode()).hexdigest()
class ReasoningTask(SynapseBase):  # type: ignore[valid-type, misc]
    """Validator -> Miner: Here is a reasoning task to solve.

    The validator fills the task description fields; the miner fills the
    response fields and sends the same synapse back.
    """

    # -- Immutable fields (set by validator, read by miner) --
    task_id: str = ""
    problem: str = ""
    domain: str = ""
    difficulty: int = Field(default=5, ge=1, le=10)
    timeout_seconds: int = 300
    context: Optional[str] = None
    constraints: Optional[str] = None

    # -- Mutable fields (filled by miner, read back by validator) --
    reasoning_steps: Optional[List[dict]] = None
    final_answer: Optional[str] = None
    proof_status: Optional[str] = None
    proof_artifact: Optional[str] = None
    code_artifact: Optional[str] = None
    time_taken_ms: Optional[int] = None
    submission_hash: Optional[str] = None

    required_hash_fields: List[str] = ["task_id", "problem", "domain", "difficulty"]

    def deserialize(self) -> dict:
        """Return the miner's response as a plain dict.

        ``reasoning_steps`` is exposed under the key ``"steps"`` and is an
        empty list when the miner supplied none.
        """
        response = {"task_id": self.task_id, "steps": self.reasoning_steps or []}
        passthrough = (
            "final_answer",
            "proof_status",
            "proof_artifact",
            "code_artifact",
            "time_taken_ms",
            "submission_hash",
        )
        for field_name in passthrough:
            response[field_name] = getattr(self, field_name)
        return response

    def compute_submission_hash(self) -> str:
        """Compute SHA-256 hash of steps + final_answer for integrity check.

        The digest covers ``task_id``, the steps as key-sorted JSON, and the
        final answer (empty string when missing), joined with ``:``.
        """
        steps = self.reasoning_steps if self.reasoning_steps else []
        answer = self.final_answer if self.final_answer else ""
        payload = "{}:{}:{}".format(self.task_id, json.dumps(steps, sort_keys=True), answer)
        digest = hashlib.sha256(payload.encode())
        return digest.hexdigest()
def verify_submission_hash(synapse: ReasoningTask) -> bool:
    """Verify the integrity of a miner's submission.

    Returns ``True`` only when the synapse carries a non-empty
    ``submission_hash`` that equals the hash recomputed from its content.
    A missing or empty hash is rejected without recomputing.
    """
    claimed = synapse.submission_hash
    if claimed:
        return claimed == synapse.compute_submission_hash()
    return False
create_default_validators, ) - +from .types import EMISSION_MINER_SHARE, EMISSION_VALIDATOR_SHARE, PEB_K, TASKS_PER_EPOCH, TRAP_RATE BANNER = r""" ____ _____ @@ -32,7 +29,9 @@ """ -def format_table(headers: List[str], rows: List[List[str]], col_widths: List[int] = None) -> str: +def format_table( + headers: List[str], rows: List[List[str]], col_widths: Optional[List[int]] = None +) -> str: """Format a simple ASCII table.""" if not col_widths: col_widths = [] @@ -60,7 +59,13 @@ def format_table(headers: List[str], rows: List[List[str]], col_widths: List[int return "\n".join(lines) -def run_simulation(epochs: int, emission: float, output: str = None, seed: int = None, verbose: bool = False): +def run_simulation( + epochs: int, + emission: float, + output: Optional[str] = None, + seed: Optional[int] = None, + verbose: bool = False, +): """Run the full multi-epoch simulation.""" print(BANNER) @@ -69,10 +74,14 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = print("=" * 70) print(f" Epochs: {epochs}") print(f" Emission/epoch: {emission} TAO") - print(f" Miner pool: {emission * EMISSION_MINER_SHARE} TAO ({EMISSION_MINER_SHARE*100:.0f}%)") - print(f" Validator pool: {emission * EMISSION_VALIDATOR_SHARE} TAO ({EMISSION_VALIDATOR_SHARE*100:.0f}%)") + print( + f" Miner pool: {emission * EMISSION_MINER_SHARE} TAO ({EMISSION_MINER_SHARE * 100:.0f}%)" + ) + print( + f" Validator pool: {emission * EMISSION_VALIDATOR_SHARE} TAO ({EMISSION_VALIDATOR_SHARE * 100:.0f}%)" + ) print(f" Tasks/epoch: {TASKS_PER_EPOCH}") - print(f" Trap rate: {TRAP_RATE*100:.0f}%") + print(f" Trap rate: {TRAP_RATE * 100:.0f}%") print(f" Top-K PEB: {PEB_K}") print(f" Seed: {seed if seed is not None else 'random'}") print("=" * 70) @@ -101,11 +110,13 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = all_epoch_results.append(EpochSimulator.to_json(result)) # Print epoch header - print(f"\n{'='*70}") + print(f"\n{'=' * 70}") print(f" EPOCH 
{epoch}/{epochs}") - print(f"{'='*70}") - print(f" Tasks: {result.tasks_processed} | Traps: {result.traps_injected} | " - f"Breakthroughs: {result.breakthroughs} | Avg CMS: {result.avg_cms:.4f}") + print(f"{'=' * 70}") + print( + f" Tasks: {result.tasks_processed} | Traps: {result.traps_injected} | " + f"Breakthroughs: {result.breakthroughs} | Avg CMS: {result.avg_cms:.4f}" + ) print() # Miner leaderboard @@ -121,16 +132,18 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = if m["breakthroughs"] > 0: status += " B" # breakthrough - miner_rows.append([ - str(m["rank"]), - m["name"], - f"{m['s_epoch']:.4f}", - f"{m['peb']:.4f}", - str(m["streak"]), - f"{m['epoch_tao']:.2f}", - f"{m['total_tao']:.2f}", - status.strip(), - ]) + miner_rows.append( + [ + str(m["rank"]), + m["name"], + f"{m['s_epoch']:.4f}", + f"{m['peb']:.4f}", + str(m["streak"]), + f"{m['epoch_tao']:.2f}", + f"{m['total_tao']:.2f}", + status.strip(), + ] + ) print(" MINER LEADERBOARD") print(" " + "-" * 66) @@ -152,15 +165,17 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = else: health = "[BAD]" - val_rows.append([ - v["name"], - str(int(v["stake"])), - f"{v['vas']:.4f}", - f"{v['reputation']:.3f}", - f"{v['epoch_tao']:.2f}", - f"{v['slashed']:.4f}", - health, - ]) + val_rows.append( + [ + v["name"], + str(int(v["stake"])), + f"{v['vas']:.4f}", + f"{v['reputation']:.3f}", + f"{v['epoch_tao']:.2f}", + f"{v['slashed']:.4f}", + health, + ] + ) print(" VALIDATOR SUMMARY") print(" " + "-" * 66) @@ -169,14 +184,14 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = print(f" {line}") if verbose: - print(f"\n [Verbose] Per-task CMS values:") + print("\n [Verbose] Per-task CMS values:") for m in result.miner_results: print(f" {m['name']}: S_epoch={m['s_epoch']:.4f}") # Final standings - print(f"\n\n{'='*70}") + print(f"\n\n{'=' * 70}") print(" FINAL STANDINGS AFTER ALL EPOCHS") - print(f"{'='*70}") + print(f"{'=' 
* 70}") # Get final miner standings final_miners = sorted( @@ -187,8 +202,10 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = print("\n TOP MINERS (by Total TAO):") for i, m in enumerate(final_miners[:5], 1): star = " *" if i <= 3 else "" - print(f" {i}. {m['name']:20s} Total: {m['total_tao']:8.2f} TAO " - f"Streak: {m['streak']:2d} PEB: {m['peb']:.4f}{star}") + print( + f" {i}. {m['name']:20s} Total: {m['total_tao']:8.2f} TAO " + f"Streak: {m['streak']:2d} PEB: {m['peb']:.4f}{star}" + ) # Final validator standings print("\n VALIDATORS:") @@ -200,11 +217,13 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = indicator = "[WARN]" else: indicator = "[BAD]" - print(f" {v['name']:12s} Stake: {int(v['stake']):5d} VAS: {v['vas']:.4f} " - f"Total: {v['total_tao']:.2f} TAO Slashed: {v['slashed']:.4f} {indicator}") + print( + f" {v['name']:12s} Stake: {int(v['stake']):5d} VAS: {v['vas']:.4f} " + f"Total: {v['total_tao']:.2f} TAO Slashed: {v['slashed']:.4f} {indicator}" + ) # Key observations - print(f"\n KEY OBSERVATIONS:") + print("\n KEY OBSERVATIONS:") # Find adversarial miners bottom = final_miners[-2:] for m in bottom: @@ -222,7 +241,7 @@ def run_simulation(epochs: int, emission: float, output: str = None, seed: int = names = ", ".join(m["name"] for m in streak_leaders) print(f" * Maintained full streak: {names}") - print(f"\n{'='*70}") + print(f"\n{'=' * 70}") # Save output if output: @@ -249,7 +268,9 @@ def main(): description="ReasonForge - Decentralized Verifiable Reasoning Simulator" ) parser.add_argument("--epochs", type=int, default=5, help="Number of epochs (default: 5)") - parser.add_argument("--emission", type=float, default=100.0, help="TAO per epoch (default: 100.0)") + parser.add_argument( + "--emission", type=float, default=100.0, help="TAO per epoch (default: 100.0)" + ) parser.add_argument("--output", type=str, default=None, help="Save JSON results to file") parser.add_argument("--seed", 
class AnomalyDetector:
    """Detect suspicious miner behavior patterns.

    Each ``check_*`` method logs a warning when it flags something, so a
    check should be evaluated once per observation.

    Args:
        min_solve_time_per_difficulty: Minimum plausible solve time in
            milliseconds per difficulty level.
        score_variance_threshold: CMS variance below this is treated as
            suspiciously consistent.
        collusion_similarity_threshold: Similarity strictly above this flags
            a pair of submissions as colluding.
    """

    def __init__(
        self,
        min_solve_time_per_difficulty: int = 500,  # ms per difficulty level
        score_variance_threshold: float = 0.001,
        collusion_similarity_threshold: float = 0.98,
    ):
        self.min_solve_time_per_difficulty = min_solve_time_per_difficulty
        self.score_variance_threshold = score_variance_threshold
        self.collusion_threshold = collusion_similarity_threshold

    def check_timing_anomaly(self, time_ms: int, difficulty: int) -> bool:
        """
        Flag if solve time is unrealistically fast for difficulty.

        Returns:
            True if anomalous, False if normal.
        """
        min_expected = difficulty * self.min_solve_time_per_difficulty
        if time_ms < min_expected:
            logger.warning(
                "Timing anomaly: %dms for difficulty %d (min expected: %dms)",
                time_ms,
                difficulty,
                min_expected,
            )
            return True
        return False

    def check_score_manipulation(self, cms_history: List[float]) -> bool:
        """
        Flag if CMS scores are suspiciously consistent (potential gaming).

        Histories with fewer than 5 scores are never flagged.

        Returns:
            True if suspicious, False if normal.
        """
        if len(cms_history) < 5:
            return False

        # At least 5 data points are present here, so statistics.variance
        # cannot raise StatisticsError (it needs only two).
        variance = statistics.variance(cms_history)
        if variance < self.score_variance_threshold:
            logger.warning(
                "Score manipulation suspected: variance=%.6f (threshold=%.6f)",
                variance,
                self.score_variance_threshold,
            )
            return True
        return False

    def check_sudden_improvement(self, recent_scores: List[float], historical_avg: float) -> bool:
        """
        Flag if a miner suddenly improves dramatically (potential identity swap).

        Flags when the mean of ``recent_scores`` is more than twice
        ``historical_avg``. Empty input or a non-positive average is never
        flagged.

        Returns:
            True if suspicious, False if normal.
        """
        if not recent_scores or historical_avg <= 0:
            return False

        recent_avg = sum(recent_scores) / len(recent_scores)
        improvement_ratio = recent_avg / historical_avg

        if improvement_ratio > 2.0:
            logger.warning(
                "Sudden improvement detected: %.2f -> %.2f (%.1fx)",
                historical_avg,
                recent_avg,
                improvement_ratio,
            )
            return True

        return False

    def check_collusion(
        self,
        submissions: List[dict],
        similarity_fn=None,
    ) -> List[Tuple[int, int, float]]:
        """
        Detect colluding miners with near-identical submissions.

        Every unordered pair is compared once; submissions with empty or
        missing text are skipped.

        Args:
            submissions: List of {uid, text} dicts. A missing ``uid`` falls
                back to the submission's list index.
            similarity_fn: Function(text_a, text_b) -> float. Defaults to a
                word-level Jaccard similarity.

        Returns:
            List of (uid_a, uid_b, similarity) tuples for flagged pairs.
        """
        if similarity_fn is None:
            # Basic text similarity fallback
            similarity_fn = self._jaccard_similarity

        entries = [
            (sub.get("uid", idx), sub.get("text", "")) for idx, sub in enumerate(submissions)
        ]

        flagged = []
        for i, (uid_a, text_a) in enumerate(entries):
            if not text_a:
                continue
            for uid_b, text_b in entries[i + 1 :]:
                if not text_b:
                    continue
                sim = similarity_fn(text_a, text_b)
                if sim > self.collusion_threshold:
                    flagged.append((uid_a, uid_b, sim))
                    # %s rather than %d: uids come from untyped dicts and a
                    # non-integer uid must not break the log call.
                    logger.warning(
                        "Collusion detected: UID %s and UID %s (similarity=%.4f)",
                        uid_a,
                        uid_b,
                        sim,
                    )

        return flagged

    @staticmethod
    def _jaccard_similarity(text_a: str, text_b: str) -> float:
        """Simple Jaccard similarity as fallback.

        Compares the sets of lower-cased, whitespace-separated words and
        returns 0.0 when both texts contain no words.
        """
        words_a = set(text_a.lower().split())
        words_b = set(text_b.lower().split())
        union = len(words_a | words_b)
        if union == 0:
            return 0.0
        return len(words_a & words_b) / union

    def get_anomaly_report(
        self,
        uid: int,
        time_ms: int,
        difficulty: int,
        cms_history: List[float],
    ) -> dict:
        """Generate a full anomaly report for a miner.

        Returns a dict with ``uid``, ``timing_anomaly``,
        ``score_manipulation`` and ``flags_count`` (number of raised flags).
        """
        # Evaluate each check exactly once so a flagged miner produces one
        # warning per anomaly rather than two.
        timing = self.check_timing_anomaly(time_ms, difficulty)
        manipulation = self.check_score_manipulation(cms_history)
        return {
            "uid": uid,
            "timing_anomaly": timing,
            "score_manipulation": manipulation,
            "flags_count": int(timing) + int(manipulation),
        }
class RateGuard:
    """Per-UID rate limiting to prevent DoS.

    Keeps, per UID, the timestamps of accepted requests and allows at most
    ``max_requests_per_minute`` of them within any trailing 60 seconds.
    """

    def __init__(self, max_requests_per_minute: int = 10):
        self.max_rpm = max_requests_per_minute
        self._requests: Dict[int, List[float]] = defaultdict(list)

    def check(self, uid: int) -> bool:
        """
        Check if a request from this UID is allowed.

        An allowed request is recorded; a rejected one is not.

        Returns:
            True if request is allowed, False if rate limited.
        """
        now = time.time()
        # Keep only the timestamps that are still inside the 60 s window.
        window = [stamp for stamp in self._requests[uid] if now - stamp < 60]
        self._requests[uid] = window

        if len(window) < self.max_rpm:
            window.append(now)
            return True

        logger.warning("Rate limited UID %d (%d requests/min)", uid, len(window))
        return False

    def get_remaining(self, uid: int) -> int:
        """Get remaining requests for a UID in the current window."""
        now = time.time()
        active = sum(1 for stamp in self._requests.get(uid, []) if now - stamp < 60)
        return max(0, self.max_rpm - active)

    def reset(self, uid: int) -> None:
        """Reset rate limit for a specific UID."""
        self._requests.pop(uid, None)

    def reset_all(self) -> None:
        """Reset all rate limits."""
        self._requests.clear()
class InputSanitizer:
    """Validate and sanitize all inputs from miners and external API.

    All methods are static. Everything passed in is treated as untrusted:
    malformed values are neutralized rather than allowed to raise.
    """

    MAX_STEP_LENGTH = 10_000  # chars per reasoning step
    MAX_STEPS = 50  # max steps per submission
    MAX_ANSWER_LENGTH = 50_000  # chars
    MAX_PROOF_SIZE = 1_000_000  # bytes (1MB)
    MAX_CODE_SIZE = 500_000  # bytes (500KB)
    MAX_PROBLEM_LENGTH = 10_000  # chars

    VALID_DOMAINS = frozenset(
        {"mathematics", "code", "scientific", "strategic", "causal", "ethical"}
    )

    # A <script> element including its content (any case, may span lines).
    _SCRIPT_RE = re.compile(r"<script[^>]*>.*?</script>", re.DOTALL | re.IGNORECASE)
    # Any remaining "<...>" span.
    _TAG_RE = re.compile(r"<[^>]+>")

    @staticmethod
    def sanitize_submission(response) -> None:
        """
        Validate and sanitize all miner-provided fields in-place.

        Truncates oversized fields, clamps each step's confidence to
        [0.0, 1.0] and removes artifacts that are not decodable base64 or
        that exceed their size limit. Attributes missing from ``response``
        are ignored.
        """
        # 1. Truncate reasoning steps
        steps = getattr(response, "reasoning_steps", None)
        if steps:
            if len(steps) > InputSanitizer.MAX_STEPS:
                steps = steps[: InputSanitizer.MAX_STEPS]
                response.reasoning_steps = steps
                logger.warning("Truncated steps from submission")

            for step in steps:
                if isinstance(step, dict):
                    InputSanitizer._sanitize_step(step)

        # 2. Truncate final answer
        answer = getattr(response, "final_answer", None)
        if isinstance(answer, str) and len(answer) > InputSanitizer.MAX_ANSWER_LENGTH:
            response.final_answer = answer[: InputSanitizer.MAX_ANSWER_LENGTH]

        # 3. Validate proof artifact size
        if getattr(response, "proof_artifact", None):
            response.proof_artifact = InputSanitizer._checked_artifact(
                response.proof_artifact, InputSanitizer.MAX_PROOF_SIZE, "proof"
            )

        # 4. Validate code artifact size
        if getattr(response, "code_artifact", None):
            response.code_artifact = InputSanitizer._checked_artifact(
                response.code_artifact, InputSanitizer.MAX_CODE_SIZE, "code"
            )

    @staticmethod
    def _sanitize_step(step: dict) -> None:
        """Truncate the text fields of one step and clamp its confidence."""
        limit = InputSanitizer.MAX_STEP_LENGTH
        for key in ("reasoning", "evidence"):
            value = step.get(key, "")
            # Only strings are truncated; None and other non-string values
            # are left untouched instead of crashing on len().
            if isinstance(value, str) and len(value) > limit:
                step[key] = value[:limit]

        step["confidence"] = InputSanitizer._clamp_confidence(step.get("confidence", 0.0))

    @staticmethod
    def _clamp_confidence(raw) -> float:
        """Return ``raw`` as a float in [0.0, 1.0]; 0.0 if unusable or NaN."""
        try:
            value = float(raw)
        except (TypeError, ValueError, OverflowError):
            return 0.0
        # NaN is the only float not equal to itself. min()/max() would let
        # it through as 1.0, so reject it explicitly.
        if value != value:
            return 0.0
        return max(0.0, min(1.0, value))

    @staticmethod
    def _checked_artifact(encoded, max_bytes: int, label: str):
        """Return ``encoded`` if it is base64 decoding to <= ``max_bytes``, else None."""
        try:
            decoded = base64.b64decode(encoded)
        except (TypeError, ValueError):
            # binascii.Error (bad padding/alphabet) is a ValueError subclass.
            logger.warning("Removed undecodable %s artifact", label)
            return None
        if len(decoded) > max_bytes:
            logger.warning("Removed oversized %s artifact", label)
            return None
        return encoded

    @staticmethod
    def sanitize_problem(problem: str) -> str:
        """Sanitize a problem statement from external API.

        Truncates to ``MAX_PROBLEM_LENGTH`` characters, removes ``<script>``
        elements together with their content, removes remaining markup tags
        and strips surrounding whitespace. Empty input yields ``""``.
        """
        if not problem:
            return ""

        # Truncate
        problem = problem[: InputSanitizer.MAX_PROBLEM_LENGTH]

        # Remove script elements first so their content goes with them.
        problem = InputSanitizer._SCRIPT_RE.sub("", problem)
        # Remove HTML tags.
        # NOTE(review): this also removes text between a "<" and a later ">"
        # (e.g. "a < b and c > d"); tighten if math input must survive.
        problem = InputSanitizer._TAG_RE.sub("", problem)

        return problem.strip()

    @staticmethod
    def validate_domain(domain: str) -> bool:
        """Validate that a domain is one of the allowed values (case-insensitive)."""
        return isinstance(domain, str) and domain.lower() in InputSanitizer.VALID_DOMAINS

    @staticmethod
    def validate_difficulty(difficulty: int) -> bool:
        """Validate difficulty is in range 1..10 inclusive."""
        return 1 <= difficulty <= 10
BREAKTHROUGH_THRESHOLD, - DIFFICULTY_MULTIPLIER, DOMAIN_CHECK_WEIGHTS, EMISSION_MINER_SHARE, EMISSION_VALIDATOR_SHARE, - OBJECTIVE_WEIGHT, - CONSENSUS_WEIGHT, PEB_K, TASKS_PER_EPOCH, VALIDATORS_PER_TASK, - Domain, DimensionScores, + Domain, EpochResult, MinerState, MinerSubmission, @@ -31,20 +30,16 @@ Task, ValidatorState, ) -from .engine import ScoringEngine -from .task_generator import TaskGenerator -from .plagiarism import PlagiarismDetector - # ────────────────────────────────────────────── # Miner Profiles # ────────────────────────────────────────────── MINER_TIERS = { - "elite": {"q": 0.88, "a": 0.90, "n": 0.80, "e": 0.85, "var": 0.06}, - "strong": {"q": 0.78, "a": 0.80, "n": 0.70, "e": 0.75, "var": 0.08}, - "mid": {"q": 0.65, "a": 0.68, "n": 0.55, "e": 0.65, "var": 0.10}, - "weak": {"q": 0.45, "a": 0.50, "n": 0.40, "e": 0.55, "var": 0.12}, + "elite": {"q": 0.88, "a": 0.90, "n": 0.80, "e": 0.85, "var": 0.06}, + "strong": {"q": 0.78, "a": 0.80, "n": 0.70, "e": 0.75, "var": 0.08}, + "mid": {"q": 0.65, "a": 0.68, "n": 0.55, "e": 0.65, "var": 0.10}, + "weak": {"q": 0.45, "a": 0.50, "n": 0.40, "e": 0.55, "var": 0.12}, "adversarial": {"q": 0.20, "a": 0.15, "n": 0.10, "e": 0.30, "var": 0.15}, } @@ -52,7 +47,7 @@ class MinerProfile: """Simulated miner with capability profile based on tier.""" - def __init__(self, miner_id: str, name: str, tier: str, seed: int = None): + def __init__(self, miner_id: str, name: str, tier: str, seed: Optional[int] = None): self.miner_id = miner_id self.name = name self.tier = tier @@ -82,10 +77,18 @@ def solve_task(self, task: Task) -> Tuple[DimensionScores, MinerSubmission]: domain_bonus = self.domain_bonuses.get(task.domain, 0.0) diff_penalty = (task.difficulty - 5) * 0.015 - q = self._clamp(self.base_quality + domain_bonus - diff_penalty + self.rng.gauss(0, self.variance)) - a = self._clamp(self.base_accuracy + domain_bonus - diff_penalty + self.rng.gauss(0, self.variance)) - n = self._clamp(self.base_novelty + domain_bonus - 
diff_penalty + self.rng.gauss(0, self.variance)) - e = self._clamp(self.base_efficiency + domain_bonus - diff_penalty + self.rng.gauss(0, self.variance)) + q = self._clamp( + self.base_quality + domain_bonus - diff_penalty + self.rng.gauss(0, self.variance) + ) + a = self._clamp( + self.base_accuracy + domain_bonus - diff_penalty + self.rng.gauss(0, self.variance) + ) + n = self._clamp( + self.base_novelty + domain_bonus - diff_penalty + self.rng.gauss(0, self.variance) + ) + e = self._clamp( + self.base_efficiency + domain_bonus - diff_penalty + self.rng.gauss(0, self.variance) + ) scores = DimensionScores(quality=q, accuracy=a, novelty=n, efficiency=e) @@ -93,12 +96,14 @@ def solve_task(self, task: Task) -> Tuple[DimensionScores, MinerSubmission]: num_steps = self.rng.randint(2, 5) steps = [] for i in range(num_steps): - steps.append(ReasoningStep( - step_id=i + 1, - reasoning=f"Step {i+1}: {self.name} applies reasoning for {task.domain.value} task", - evidence=f"Evidence from analysis of {task.problem[:50]}...", - confidence=self._clamp(scores.cms + self.rng.gauss(0, 0.05)), - )) + steps.append( + ReasoningStep( + step_id=i + 1, + reasoning=f"Step {i + 1}: {self.name} applies reasoning for {task.domain.value} task", + evidence=f"Evidence from analysis of {task.problem[:50]}...", + confidence=self._clamp(scores.cms + self.rng.gauss(0, 0.05)), + ) + ) submission = MinerSubmission( task_id=task.task_id, @@ -117,9 +122,9 @@ def solve_task(self, task: Task) -> Tuple[DimensionScores, MinerSubmission]: # ────────────────────────────────────────────── VALIDATOR_PROFILES = { - "honest": {"noise": 0.03, "bias": 0.0}, - "good": {"noise": 0.06, "bias": 0.0}, - "lazy": {"noise": 0.15, "bias": -0.10}, + "honest": {"noise": 0.03, "bias": 0.0}, + "good": {"noise": 0.06, "bias": 0.0}, + "lazy": {"noise": 0.15, "bias": -0.10}, "malicious": {"noise": 0.25, "bias": +0.20}, } @@ -127,7 +132,9 @@ def solve_task(self, task: Task) -> Tuple[DimensionScores, MinerSubmission]: class 
ValidatorProfile: """Simulated validator with accuracy profile.""" - def __init__(self, validator_id: str, name: str, stake: float, accuracy: str, seed: int = None): + def __init__( + self, validator_id: str, name: str, stake: float, accuracy: str, seed: Optional[int] = None + ): self.validator_id = validator_id self.name = name self.stake = stake @@ -173,7 +180,9 @@ def evaluate(self, true_score: float) -> float: ] -def create_default_miners(seed: int = None) -> Tuple[List[MinerProfile], List[MinerState]]: +def create_default_miners( + seed: Optional[int] = None, +) -> Tuple[List[MinerProfile], List[MinerState]]: """Create the default roster of 12 miners.""" profiles = [] states = [] @@ -184,7 +193,9 @@ def create_default_miners(seed: int = None) -> Tuple[List[MinerProfile], List[Mi return profiles, states -def create_default_validators(seed: int = None) -> Tuple[List[ValidatorProfile], List[ValidatorState]]: +def create_default_validators( + seed: Optional[int] = None, +) -> Tuple[List[ValidatorProfile], List[ValidatorState]]: """Create the default roster of 6 validators.""" profiles = [] states = [] @@ -199,6 +210,7 @@ def create_default_validators(seed: int = None) -> Tuple[List[ValidatorProfile], # Epoch Simulator # ────────────────────────────────────────────── + class EpochSimulator: """ Main simulation runner. 
Executes the full epoch loop: @@ -213,7 +225,7 @@ def __init__( validator_states: Optional[List[ValidatorState]] = None, epoch_id: int = 1, total_emission: float = 100.0, - seed: int = None, + seed: Optional[int] = None, ): self.miner_profiles = {mp.miner_id: mp for mp in miner_profiles} self.validator_profiles = {vp.validator_id: vp for vp in validator_profiles} @@ -253,12 +265,8 @@ def run_epoch(self) -> EpochResult: vs.slashed_amount = 0.0 # Track validator deviations for VAS computation - validator_scores_given: Dict[str, List[float]] = { - vid: [] for vid in self.validator_states - } - validator_consensus_ref: Dict[str, List[float]] = { - vid: [] for vid in self.validator_states - } + validator_scores_given: Dict[str, List[float]] = {vid: [] for vid in self.validator_states} + validator_consensus_ref: Dict[str, List[float]] = {vid: [] for vid in self.validator_states} all_cms_values: List[float] = [] total_breakthroughs = 0 @@ -284,14 +292,21 @@ def run_epoch(self) -> EpochResult: domain_weights = DOMAIN_CHECK_WEIGHTS.get(task.domain, {"logic": 1.0}) # Simulate check results from dimension scores checks = {} - dim_values = [dim_scores.quality, dim_scores.accuracy, dim_scores.novelty, dim_scores.efficiency] + dim_values = [ + dim_scores.quality, + dim_scores.accuracy, + dim_scores.novelty, + dim_scores.efficiency, + ] for i, key in enumerate(domain_weights.keys()): checks[key] = dim_values[i % len(dim_values)] o_score = ScoringEngine.compute_objective_score(checks, domain_weights) # 3c-d. Assign validators and evaluate validator_ids = list(self.validator_states.keys()) - assigned = self.rng.sample(validator_ids, min(VALIDATORS_PER_TASK, len(validator_ids))) + assigned = self.rng.sample( + validator_ids, min(VALIDATORS_PER_TASK, len(validator_ids)) + ) val_scores_stakes: List[Tuple[float, float]] = [] for vid in assigned: @@ -362,9 +377,7 @@ def run_epoch(self) -> EpochResult: ms.s_epoch = 0.0 # 5. 
Rank miners by S_epoch descending - sorted_miners = sorted( - self.miner_states.values(), key=lambda m: m.s_epoch, reverse=True - ) + sorted_miners = sorted(self.miner_states.values(), key=lambda m: m.s_epoch, reverse=True) for rank, ms in enumerate(sorted_miners, 1): ms.rank = rank # Update streak: increment if in top-K, else reset @@ -415,31 +428,35 @@ def run_epoch(self) -> EpochResult: # 12. Build and return EpochResult miner_results = [] for ms in sorted_miners: - miner_results.append({ - "rank": ms.rank, - "miner_id": ms.miner_id, - "name": ms.name, - "s_epoch": round(ms.s_epoch, 6), - "peb": round(ms.peb, 6), - "streak": ms.streak, - "epoch_tao": ms.epoch_tao, - "total_tao": round(ms.total_tao_earned, 6), - "trap_penalty": round(ms.trap_penalty, 6), - "breakthroughs": ms.breakthroughs, - }) + miner_results.append( + { + "rank": ms.rank, + "miner_id": ms.miner_id, + "name": ms.name, + "s_epoch": round(ms.s_epoch, 6), + "peb": round(ms.peb, 6), + "streak": ms.streak, + "epoch_tao": ms.epoch_tao, + "total_tao": round(ms.total_tao_earned, 6), + "trap_penalty": round(ms.trap_penalty, 6), + "breakthroughs": ms.breakthroughs, + } + ) validator_results = [] for vs in val_list: - validator_results.append({ - "validator_id": vs.validator_id, - "name": vs.name, - "stake": vs.stake, - "vas": round(vs.current_vas, 6), - "reputation": round(vs.reputation_multiplier, 6), - "epoch_tao": vs.epoch_tao, - "total_tao": round(vs.total_tao_earned, 6), - "slashed": round(vs.slashed_amount, 6), - }) + validator_results.append( + { + "validator_id": vs.validator_id, + "name": vs.name, + "stake": vs.stake, + "vas": round(vs.current_vas, 6), + "reputation": round(vs.reputation_multiplier, 6), + "epoch_tao": vs.epoch_tao, + "total_tao": round(vs.total_tao_earned, 6), + "slashed": round(vs.slashed_amount, 6), + } + ) avg_cms = sum(all_cms_values) / len(all_cms_values) if all_cms_values else 0.0 diff --git a/reasonforge/state/__init__.py b/reasonforge/state/__init__.py new file mode 
logger = logging.getLogger("reasonforge.state.checkpoint")


class CheckpointManager:
    """Higher-level checkpoint save/restore built on a ``StateDatabase``.

    Args:
        db: Storage backend; must provide ``save_checkpoint``,
            ``prune_checkpoints`` and ``load_latest_checkpoint``.
        keep_last: Number of most recent checkpoints to retain after each
            save. A value of zero or less disables pruning entirely.
    """

    def __init__(self, db: StateDatabase, keep_last: int = 10):
        self.db = db
        self.keep_last = keep_last

    def save(self, epoch_id: int, state: Dict[str, Any]) -> None:
        """Persist ``state`` for ``epoch_id`` and prune older checkpoints.

        Pruning is skipped when ``keep_last`` is not positive: a limit of 0
        would otherwise delete the checkpoint that was just written.
        """
        self.db.save_checkpoint(state, epoch_id=epoch_id)
        if self.keep_last > 0:
            self.db.prune_checkpoints(keep_last=self.keep_last)
        logger.info("Checkpoint saved for epoch %d", epoch_id)

    def load_latest(self) -> Optional[Dict[str, Any]]:
        """Return the most recent checkpoint, or ``None`` when none exists."""
        return self.db.load_latest_checkpoint()
class StateDatabase:
    """SQLite state persistence for validators and miners.

    Stores miner epoch records, task results, submissions, checkpoints and
    API keys in a single SQLite file. Every write runs inside
    ``with self.conn:`` so it is committed on success and rolled back on
    failure; a failed statement therefore never leaves a transaction open.

    The instance can be used as a context manager; the connection is closed
    on exit.
    """

    # Tables reported by get_stats(); fixed names, never user input.
    _TABLES = ("miner_epochs", "task_results", "submissions", "checkpoints", "api_keys")

    def __init__(self, db_path: str):
        """Open (or create) the database at ``db_path`` and ensure the schema."""
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        try:
            self.conn.row_factory = sqlite3.Row
            self.conn.execute("PRAGMA journal_mode=WAL")
            self._create_tables()
        except Exception:
            # Do not leak the handle when schema setup fails.
            self.conn.close()
            raise

    def __enter__(self) -> "StateDatabase":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _create_tables(self) -> None:
        """Create all required tables and indexes if they don't exist."""
        self.conn.executescript("""
            CREATE TABLE IF NOT EXISTS miner_epochs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                epoch_id INTEGER NOT NULL,
                miner_uid INTEGER NOT NULL,
                s_epoch REAL NOT NULL DEFAULT 0.0,
                peb REAL NOT NULL DEFAULT 0.0,
                rank INTEGER NOT NULL DEFAULT 0,
                streak INTEGER NOT NULL DEFAULT 0,
                tao_earned REAL NOT NULL DEFAULT 0.0,
                created_at REAL NOT NULL DEFAULT (strftime('%s', 'now')),
                UNIQUE(epoch_id, miner_uid)
            );

            CREATE TABLE IF NOT EXISTS task_results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                task_id TEXT NOT NULL,
                epoch_id INTEGER NOT NULL,
                domain TEXT NOT NULL,
                difficulty INTEGER NOT NULL DEFAULT 5,
                is_trap INTEGER NOT NULL DEFAULT 0,
                avg_cms REAL NOT NULL DEFAULT 0.0,
                best_miner_uid INTEGER,
                created_at REAL NOT NULL DEFAULT (strftime('%s', 'now'))
            );

            CREATE TABLE IF NOT EXISTS submissions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                submission_id TEXT NOT NULL UNIQUE,
                task_id TEXT NOT NULL,
                miner_uid INTEGER NOT NULL,
                cms REAL NOT NULL DEFAULT 0.0,
                quality REAL NOT NULL DEFAULT 0.0,
                accuracy REAL NOT NULL DEFAULT 0.0,
                novelty REAL NOT NULL DEFAULT 0.0,
                efficiency REAL NOT NULL DEFAULT 0.0,
                submission_hash TEXT NOT NULL DEFAULT '',
                created_at REAL NOT NULL DEFAULT (strftime('%s', 'now'))
            );

            CREATE TABLE IF NOT EXISTS checkpoints (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                epoch_id INTEGER NOT NULL,
                state_json TEXT NOT NULL,
                created_at REAL NOT NULL DEFAULT (strftime('%s', 'now'))
            );

            CREATE TABLE IF NOT EXISTS api_keys (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                key_id TEXT NOT NULL,
                api_key TEXT NOT NULL UNIQUE,
                owner TEXT NOT NULL DEFAULT '',
                tier TEXT NOT NULL DEFAULT 'free',
                request_limit INTEGER NOT NULL DEFAULT 100,
                requests_used INTEGER NOT NULL DEFAULT 0,
                created_at REAL NOT NULL DEFAULT (strftime('%s', 'now'))
            );

            CREATE INDEX IF NOT EXISTS idx_miner_epochs_uid ON miner_epochs(miner_uid);
            CREATE INDEX IF NOT EXISTS idx_miner_epochs_epoch ON miner_epochs(epoch_id);
            CREATE INDEX IF NOT EXISTS idx_task_results_epoch ON task_results(epoch_id);
            CREATE INDEX IF NOT EXISTS idx_checkpoints_epoch ON checkpoints(epoch_id);
            CREATE INDEX IF NOT EXISTS idx_api_keys_key ON api_keys(api_key);
        """)
        self.conn.commit()

    # ── Miner Epoch Data ────────────────────────────

    def save_miner_epoch(
        self,
        epoch_id: int,
        miner_uid: int,
        s_epoch: float,
        peb: float,
        rank: int,
        streak: int,
        tao_earned: float = 0.0,
    ) -> None:
        """Save miner performance for an epoch.

        An existing row for the same ``(epoch_id, miner_uid)`` is replaced.
        """
        with self.conn:
            self.conn.execute(
                """INSERT OR REPLACE INTO miner_epochs
                   (epoch_id, miner_uid, s_epoch, peb, rank, streak, tao_earned, created_at)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (epoch_id, miner_uid, s_epoch, peb, rank, streak, tao_earned, time.time()),
            )

    def get_miner_history(self, miner_uid: int) -> List[Dict[str, Any]]:
        """Get all epoch records for a miner, ordered by ascending epoch."""
        rows = self.conn.execute(
            "SELECT * FROM miner_epochs WHERE miner_uid = ? ORDER BY epoch_id",
            (miner_uid,),
        ).fetchall()
        return [dict(row) for row in rows]

    def get_epoch_leaderboard(self, epoch_id: int) -> List[Dict[str, Any]]:
        """Get leaderboard for an epoch, sorted by s_epoch descending."""
        rows = self.conn.execute(
            "SELECT * FROM miner_epochs WHERE epoch_id = ? ORDER BY s_epoch DESC",
            (epoch_id,),
        ).fetchall()
        return [dict(row) for row in rows]

    # ── Task Results ────────────────────────────────

    def save_task_result(
        self,
        task_id: str,
        epoch_id: int,
        domain: str,
        difficulty: int,
        is_trap: bool,
        avg_cms: float,
        best_miner_uid: Optional[int] = None,
    ) -> None:
        """Save a task result; ``is_trap`` is stored as 0/1."""
        with self.conn:
            self.conn.execute(
                """INSERT INTO task_results
                   (task_id, epoch_id, domain, difficulty, is_trap, avg_cms, best_miner_uid, created_at)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    task_id,
                    epoch_id,
                    domain,
                    difficulty,
                    int(is_trap),
                    avg_cms,
                    best_miner_uid,
                    time.time(),
                ),
            )

    # ── Submissions ─────────────────────────────────

    def save_submission(
        self,
        submission_id: str,
        task_id: str,
        miner_uid: int,
        cms: float,
        quality: float,
        accuracy: float,
        novelty: float,
        efficiency: float,
        submission_hash: str = "",
    ) -> None:
        """Save a miner submission.

        Raises:
            sqlite3.IntegrityError: if ``submission_id`` already exists. The
                failed insert is rolled back before the error propagates.
        """
        with self.conn:
            self.conn.execute(
                """INSERT INTO submissions
                   (submission_id, task_id, miner_uid, cms, quality, accuracy, novelty, efficiency,
                    submission_hash, created_at)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    submission_id,
                    task_id,
                    miner_uid,
                    cms,
                    quality,
                    accuracy,
                    novelty,
                    efficiency,
                    submission_hash,
                    time.time(),
                ),
            )

    # ── Checkpoints ─────────────────────────────────

    def save_checkpoint(self, state: Dict[str, Any], epoch_id: int) -> None:
        """Save a checkpoint as JSON; ``state`` must be JSON-serializable."""
        # Serialize before touching the database so a TypeError cannot
        # leave a transaction open.
        payload = json.dumps(state)
        with self.conn:
            self.conn.execute(
                "INSERT INTO checkpoints (epoch_id, state_json, created_at) VALUES (?, ?, ?)",
                (epoch_id, payload, time.time()),
            )

    def load_latest_checkpoint(self) -> Optional[Dict[str, Any]]:
        """Load the checkpoint with the highest epoch (latest insert wins ties)."""
        row = self.conn.execute(
            "SELECT state_json FROM checkpoints ORDER BY epoch_id DESC, id DESC LIMIT 1"
        ).fetchone()
        if row is None:
            return None
        return json.loads(row["state_json"])

    def prune_checkpoints(self, keep_last: int = 5) -> None:
        """Delete old checkpoints, keeping the N most recent.

        ``keep_last`` is passed straight to SQLite's ``LIMIT``: 0 removes
        every checkpoint, and a negative value means "no limit", so nothing
        is removed.
        """
        with self.conn:
            self.conn.execute(
                """DELETE FROM checkpoints WHERE id NOT IN (
                       SELECT id FROM checkpoints ORDER BY epoch_id DESC, id DESC LIMIT ?
                   )""",
                (keep_last,),
            )

    # ── API Keys ────────────────────────────────────

    def save_api_key(
        self,
        key_id: str,
        api_key: str,
        owner: str,
        tier: str = "free",
        request_limit: int = 100,
    ) -> None:
        """Save an API key.

        Re-saving an existing ``api_key`` replaces the row and resets
        ``requests_used`` to 0.
        """
        with self.conn:
            self.conn.execute(
                """INSERT OR REPLACE INTO api_keys
                   (key_id, api_key, owner, tier, request_limit, requests_used, created_at)
                   VALUES (?, ?, ?, ?, ?, 0, ?)""",
                (key_id, api_key, owner, tier, request_limit, time.time()),
            )

    def get_api_key(self, api_key: str) -> Optional[Dict[str, Any]]:
        """Look up an API key; returns ``None`` when it is unknown."""
        row = self.conn.execute("SELECT * FROM api_keys WHERE api_key = ?", (api_key,)).fetchone()
        if row is None:
            return None
        return dict(row)

    def increment_api_usage(self, api_key: str) -> None:
        """Increment the usage counter for an API key (no-op if unknown)."""
        with self.conn:
            self.conn.execute(
                "UPDATE api_keys SET requests_used = requests_used + 1 WHERE api_key = ?",
                (api_key,),
            )

    # ── Stats ───────────────────────────────────────

    def get_stats(self) -> Dict[str, int]:
        """Get row counts for all tables."""
        stats: Dict[str, int] = {}
        for table in self._TABLES:
            # Table names come from the fixed tuple above, not from callers.
            row = self.conn.execute(f"SELECT COUNT(*) as cnt FROM {table}").fetchone()  # noqa: S608
            stats[table] = row["cnt"] if row else 0
        return stats

    # ── Lifecycle ───────────────────────────────────

    def close(self) -> None:
        """Close the database connection."""
        self.conn.close()
logger = logging.getLogger("reasonforge.state.migrations")

# Each migration is (version, description, SQL)
MIGRATIONS: List[Tuple[int, str, str]] = [
    (
        1,
        "Initial schema — miner_epochs, task_results, submissions, checkpoints, api_keys",
        """
        CREATE TABLE IF NOT EXISTS miner_epochs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            epoch_id INTEGER NOT NULL,
            miner_uid INTEGER NOT NULL,
            s_epoch REAL NOT NULL DEFAULT 0.0,
            peb REAL NOT NULL DEFAULT 0.0,
            rank INTEGER NOT NULL DEFAULT 0,
            streak INTEGER NOT NULL DEFAULT 0,
            tao_earned REAL NOT NULL DEFAULT 0.0,
            created_at REAL NOT NULL DEFAULT 0,
            UNIQUE(epoch_id, miner_uid)
        );

        CREATE TABLE IF NOT EXISTS task_results (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            task_id TEXT NOT NULL,
            epoch_id INTEGER NOT NULL,
            domain TEXT NOT NULL,
            difficulty INTEGER NOT NULL DEFAULT 5,
            is_trap INTEGER NOT NULL DEFAULT 0,
            avg_cms REAL NOT NULL DEFAULT 0.0,
            best_miner_uid INTEGER,
            created_at REAL NOT NULL DEFAULT 0
        );

        CREATE TABLE IF NOT EXISTS submissions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            submission_id TEXT NOT NULL UNIQUE,
            task_id TEXT NOT NULL,
            miner_uid INTEGER NOT NULL,
            cms REAL NOT NULL DEFAULT 0.0,
            quality REAL NOT NULL DEFAULT 0.0,
            accuracy REAL NOT NULL DEFAULT 0.0,
            novelty REAL NOT NULL DEFAULT 0.0,
            efficiency REAL NOT NULL DEFAULT 0.0,
            submission_hash TEXT NOT NULL DEFAULT '',
            created_at REAL NOT NULL DEFAULT 0
        );

        CREATE TABLE IF NOT EXISTS checkpoints (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            epoch_id INTEGER NOT NULL,
            state_json TEXT NOT NULL,
            created_at REAL NOT NULL DEFAULT 0
        );

        CREATE TABLE IF NOT EXISTS api_keys (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            key_id TEXT NOT NULL,
            api_key TEXT NOT NULL UNIQUE,
            owner TEXT NOT NULL DEFAULT '',
            tier TEXT NOT NULL DEFAULT 'free',
            request_limit INTEGER NOT NULL DEFAULT 100,
            requests_used INTEGER NOT NULL DEFAULT 0,
            created_at REAL NOT NULL DEFAULT 0
        );
        """,
    ),
    (
        2,
        "Add indexes for performance",
        """
        CREATE INDEX IF NOT EXISTS idx_miner_epochs_uid ON miner_epochs(miner_uid);
        CREATE INDEX IF NOT EXISTS idx_miner_epochs_epoch ON miner_epochs(epoch_id);
        CREATE INDEX IF NOT EXISTS idx_task_results_epoch ON task_results(epoch_id);
        CREATE INDEX IF NOT EXISTS idx_checkpoints_epoch ON checkpoints(epoch_id);
        CREATE INDEX IF NOT EXISTS idx_api_keys_key ON api_keys(api_key);
        """,
    ),
]


class MigrationManager:
    """Simple versioned migration manager for SQLite.

    The current schema version is kept in a one-row ``schema_version``
    table on the supplied connection. ``apply_pending`` runs every entry of
    ``MIGRATIONS`` whose version is greater than the stored one.
    """

    def __init__(self, conn: sqlite3.Connection):
        self.conn = conn
        self._ensure_version_table()

    def _ensure_version_table(self) -> None:
        """Create the schema_version table and seed it with version 0 if empty."""
        self.conn.execute(
            """CREATE TABLE IF NOT EXISTS schema_version (
                version INTEGER NOT NULL DEFAULT 0
            )"""
        )
        # Seed the single version row on first use.
        row = self.conn.execute("SELECT COUNT(*) FROM schema_version").fetchone()
        if row[0] == 0:
            self.conn.execute("INSERT INTO schema_version (version) VALUES (0)")
        self.conn.commit()

    def get_current_version(self) -> int:
        """Get the current schema version (0 when no row is present)."""
        row = self.conn.execute("SELECT version FROM schema_version LIMIT 1").fetchone()
        return row[0] if row else 0

    def apply_pending(self) -> int:
        """Apply all pending migrations in ascending version order.

        Returns:
            Number of migrations applied.

        Raises:
            sqlite3.Error: if a migration fails. The open transaction is
                rolled back and the stored version is left at the last
                migration that completed.
        """
        current = self.get_current_version()
        applied = 0

        # Sort so that the result does not depend on the order in which
        # entries were appended to MIGRATIONS; otherwise a late, lower
        # version would be skipped once `current` has moved past it.
        for version, description, sql in sorted(MIGRATIONS, key=lambda entry: entry[0]):
            if version <= current:
                continue
            logger.info("Applying migration %d: %s", version, description)
            try:
                self.conn.executescript(sql)
                self.conn.execute("UPDATE schema_version SET version = ?", (version,))
                self.conn.commit()
            except sqlite3.Error:
                self.conn.rollback()
                logger.exception(
                    "Migration %d failed; schema remains at version %d", version, current
                )
                raise
            applied += 1
            current = version

        return applied
TRAP_TEMPLATES[Domain.MATHEMATICS] + ) + problem, truth = self.rng.choice(trap_templates) task = Task( task_id=str(uuid.uuid4()), problem=problem, @@ -135,8 +139,8 @@ def generate_tasks(self, count: int = 12) -> List[Task]: domains = list(Domain) for _ in range(regular_count): domain = self.rng.choice(domains) - templates = TASK_TEMPLATES[domain] - problem = self.rng.choice(templates) + regular_templates: List[str] = TASK_TEMPLATES[domain] + problem = self.rng.choice(regular_templates) task = Task( task_id=str(uuid.uuid4()), problem=problem, diff --git a/reasonforge/types.py b/reasonforge/types.py index 48edee5..2fe8eb2 100644 --- a/reasonforge/types.py +++ b/reasonforge/types.py @@ -14,7 +14,6 @@ from enum import Enum from typing import Dict, List, Optional - # ────────────────────────────────────────────── # Protocol Constants # ────────────────────────────────────────────── @@ -62,9 +61,16 @@ # Difficulty Multiplier Map (difficulty 1-10 -> D(t)) DIFFICULTY_MULTIPLIER: Dict[int, float] = { - 1: 1.0, 2: 1.0, 3: 1.25, 4: 1.25, - 5: 1.5, 6: 1.5, 7: 1.75, 8: 1.75, - 9: 2.0, 10: 2.0, + 1: 1.0, + 2: 1.0, + 3: 1.25, + 4: 1.25, + 5: 1.5, + 6: 1.5, + 7: 1.75, + 8: 1.75, + 9: 2.0, + 10: 2.0, } # Objective/Consensus Split (Eq. 
13) @@ -77,6 +83,7 @@ # Enums # ────────────────────────────────────────────── + class Domain(str, Enum): MATHEMATICS = "mathematics" CODE = "code" @@ -108,9 +115,11 @@ class TaskSource(str, Enum): # Dataclasses # ────────────────────────────────────────────── + @dataclass class Task: """A reasoning task assigned to miners.""" + task_id: str = field(default_factory=lambda: str(uuid.uuid4())) problem: str = "" domain: Domain = Domain.MATHEMATICS @@ -129,6 +138,7 @@ def difficulty_multiplier(self) -> float: @dataclass class ReasoningStep: """A single step in a miner's reasoning chain.""" + step_id: int = 0 reasoning: str = "" evidence: str = "" @@ -139,6 +149,7 @@ class ReasoningStep: @dataclass class MinerSubmission: """A miner's submission for a task.""" + task_id: str = "" miner_id: str = "" steps: List[ReasoningStep] = field(default_factory=list) @@ -160,6 +171,7 @@ def compute_hash(self) -> str: @dataclass class DimensionScores: """Scores across the four quality dimensions.""" + quality: float = 0.0 accuracy: float = 0.0 novelty: float = 0.0 @@ -179,6 +191,7 @@ def cms(self) -> float: @dataclass class ValidatorScore: """A validator's score for a specific miner's submission.""" + validator_id: str = "" objective_score: float = 0.0 consensus_score: float = 0.0 @@ -189,6 +202,7 @@ class ValidatorScore: @dataclass class MinerState: """Persistent miner state across epochs.""" + miner_id: str = "" name: str = "" epoch_scores: List[float] = field(default_factory=list) @@ -223,6 +237,7 @@ def trap_penalty(self) -> float: @dataclass class ValidatorState: """Persistent validator state across epochs.""" + validator_id: str = "" name: str = "" stake: float = 0.0 @@ -264,15 +279,14 @@ def compute_slash(self) -> float: if avg >= VAS_SLASH_THRESHOLD: self.slashed_amount = 0.0 else: - self.slashed_amount = ( - VAS_SLASH_GAMMA * self.stake * (VAS_SLASH_THRESHOLD - avg) ** 2 - ) + self.slashed_amount = VAS_SLASH_GAMMA * self.stake * (VAS_SLASH_THRESHOLD - avg) ** 2 return 
self.slashed_amount @dataclass class EpochResult: """Results from a single epoch simulation.""" + epoch_id: int = 0 total_emission: float = 0.0 miner_pool: float = 0.0 diff --git a/reasonforge/validator/__init__.py b/reasonforge/validator/__init__.py new file mode 100644 index 0000000..566c6e0 --- /dev/null +++ b/reasonforge/validator/__init__.py @@ -0,0 +1,20 @@ +""" +ReasonForge - Validator Module + +Provides the validator-side scoring pipeline, consensus, weight setting, +task management, and trap management. +""" + +from .consensus import compute_consensus_score +from .scoring import ValidatorScorer +from .task_manager import TaskManager +from .trap_manager import TrapManager +from .weight_setter import WeightSetter + +__all__ = [ + "ValidatorScorer", + "compute_consensus_score", + "WeightSetter", + "TaskManager", + "TrapManager", +] diff --git a/reasonforge/validator/consensus.py b/reasonforge/validator/consensus.py new file mode 100644 index 0000000..4fc017e --- /dev/null +++ b/reasonforge/validator/consensus.py @@ -0,0 +1,31 @@ +""" +ReasonForge - Consensus Module + +Stake-weighted trimmed median for multi-validator consensus scoring. +Wraps the MVP ScoringEngine's consensus computation for production use. +""" + +from __future__ import annotations + +from typing import List, Tuple + +from ..engine import ScoringEngine +from ..types import CONSENSUS_TRIM_DELTA + + +def compute_consensus_score( + validator_scores: List[Tuple[float, float]], + trim_delta: float = CONSENSUS_TRIM_DELTA, +) -> float: + """ + Stake-weighted trimmed median (Eq. 12). + Reuses ScoringEngine.compute_consensus_score from MVP. + + Args: + validator_scores: List of (score, stake) tuples. + trim_delta: Fraction to trim from top and bottom. + + Returns: + Consensus score as a float. 
logger = logging.getLogger("reasonforge.validator.objective_scorer")


class ObjectiveScorer:
    """Domain-specific automated checks for miner submissions.

    Each ``_check_<domain>`` coroutine returns a mapping of check name to a
    score; ``compute_objective_score`` combines that mapping with the
    domain's entry in ``DOMAIN_CHECK_WEIGHTS`` via ``ScoringEngine``.

    Miner responses are untrusted, so text fields that are missing, ``None``
    or not strings are coerced instead of raising.
    """

    def __init__(
        self,
        lean4_checker=None,
        code_sandbox=None,
        math_checker=None,
        fact_checker=None,
    ):
        self.lean4 = lean4_checker
        self.sandbox = code_sandbox
        self.math_checker = math_checker
        self.fact_checker = fact_checker

    async def compute_objective_score(self, task: Task, response_data: dict) -> float:
        """
        Compute objective score using domain-specific automated checks.
        Maps to Eq. 11 in the whitepaper.
        """
        domain = task.domain if isinstance(task.domain, Domain) else Domain(task.domain)
        weights = DOMAIN_CHECK_WEIGHTS.get(domain, {})

        checks = await self._run_domain_checks(domain, task, response_data)

        # Use engine's objective score formula
        from ..engine import ScoringEngine

        return ScoringEngine.compute_objective_score(checks, weights)

    async def _run_domain_checks(
        self, domain: Domain, task: Task, response: dict
    ) -> Dict[str, float]:
        """Run the verification checks for ``domain``; unknown domains yield ``{}``."""
        handlers = {
            Domain.MATHEMATICS: self._check_mathematics,
            Domain.CODE: self._check_code,
            Domain.SCIENTIFIC: self._check_scientific,
            Domain.STRATEGIC: self._check_strategic,
            Domain.CAUSAL: self._check_causal,
            Domain.ETHICAL: self._check_ethical,
        }
        handler = handlers.get(domain)
        if handler is None:
            return {}
        return await handler(task, response)

    # ── Input coercion helpers ──────────────────────

    @staticmethod
    def _steps_text(steps) -> str:
        """Join the ``reasoning`` text of all steps with single spaces.

        Entries that are not dicts are skipped; a missing or ``None``
        reasoning contributes an empty string.
        """
        parts = []
        for step in steps or []:
            if isinstance(step, dict):
                raw = step.get("reasoning")
                parts.append("" if raw is None else str(raw))
        return " ".join(parts)

    @staticmethod
    def _answer_text(response: dict) -> str:
        """Return ``final_answer`` lower-cased, or ``""`` when absent/``None``."""
        raw = response.get("final_answer")
        return "" if raw is None else str(raw).lower()

    @staticmethod
    def _keyword_hits(text: str, keywords) -> int:
        """Count how many of ``keywords`` occur as substrings of ``text``."""
        return sum(1 for kw in keywords if kw in text)

    # ── Domain checks ───────────────────────────────

    async def _check_mathematics(self, task: Task, response: dict) -> Dict[str, float]:
        """Score proof, numerical and step checks for a mathematics task."""
        checks = {}

        # Lean4 proof verification
        if self.lean4 and response.get("proof_artifact"):
            try:
                checks["proof"] = await self.lean4.verify(response["proof_artifact"])
            except Exception as e:
                logger.warning("Lean4 verification failed: %s", e)
                checks["proof"] = 0.0
        else:
            # No proof artifact: partial credit for reasoning
            checks["proof"] = 0.3 if response.get("steps") else 0.0

        # Numerical verification
        if self.math_checker:
            try:
                checks["numerical"] = self.math_checker.verify(
                    task.problem, response.get("final_answer", "")
                )
            except Exception as e:
                # Keep the neutral score, but leave a trace of the failure.
                logger.warning("Numerical verification failed: %s", e)
                checks["numerical"] = 0.5
        else:
            checks["numerical"] = 0.5  # Default when no checker available

        # Step verification
        checks["steps"] = self._verify_reasoning_steps(response.get("steps") or [])

        return checks

    async def _check_code(self, task: Task, response: dict) -> Dict[str, float]:
        """Score tests, static analysis and formal reasoning for a code task."""
        checks = {}

        if self.sandbox and response.get("code_artifact"):
            try:
                checks["tests"] = await self.sandbox.run_tests(response["code_artifact"])
                checks["static_analysis"] = await self.sandbox.lint(response["code_artifact"])
            except Exception as e:
                logger.warning("Code sandbox check failed: %s", e)
                checks["tests"] = 0.0
                checks["static_analysis"] = 0.0
        else:
            checks["tests"] = 0.3 if response.get("code_artifact") else 0.0
            checks["static_analysis"] = 0.5

        checks["formal"] = 0.5 if response.get("steps") else 0.0

        return checks

    async def _check_scientific(self, task: Task, response: dict) -> Dict[str, float]:
        """Keyword-based heuristics for a scientific task."""
        steps = response.get("steps") or []
        return {
            "simulation": self._check_quantitative_content(steps),
            "statistics": self._check_statistical_content(steps),
            "citations": self._check_citation_content(steps),
        }

    async def _check_strategic(self, task: Task, response: dict) -> Dict[str, float]:
        """Keyword-based heuristics for a strategic task."""
        steps = response.get("steps") or []
        return {
            "solver": self._check_formal_solution(steps),
            "constraints": self._check_constraint_handling(steps),
            "equilibrium": self._check_equilibrium_analysis(steps),
        }

    async def _check_causal(self, task: Task, response: dict) -> Dict[str, float]:
        """Keyword-based heuristics over the final answer of a causal task."""
        answer = self._answer_text(response)
        mentions_do = self._keyword_hits(answer, ["do(", "intervention", "do-calculus"]) > 0
        mentions_graph = self._keyword_hits(answer, ["dag", "graph", "node", "edge", "path"]) > 0
        return {
            "docalculus": 0.7 if mentions_do else 0.3,
            "bootstrap": 0.5 if "confidence" in answer or "interval" in answer else 0.3,
            "dag": 0.7 if mentions_graph else 0.3,
        }

    async def _check_ethical(self, task: Task, response: dict) -> Dict[str, float]:
        """Score framework coverage and logical structure for an ethical task."""
        steps = response.get("steps") or []
        all_text_lower = (self._steps_text(steps) + " " + self._answer_text(response)).lower()

        frameworks = ["utilitarian", "deontolog", "virtue", "consequential", "kantian", "rawls"]
        framework_count = self._keyword_hits(all_text_lower, frameworks)

        return {
            "coverage": min(1.0, framework_count / 3.0),
            "logic": self._check_logical_structure(steps),
        }

    # ── Step-level heuristics ───────────────────────

    def _verify_reasoning_steps(self, steps: list) -> float:
        """Score the quality of reasoning steps.

        Per step: +0.3 for more than 50 characters of reasoning, +0.2 for
        any evidence, +0.1 for a numeric confidence above 0.5. The total is
        averaged over the number of steps and capped at 1.0.
        """
        if not steps:
            return 0.0

        score = 0.0
        for step in steps:
            if not isinstance(step, dict):
                continue  # malformed entry earns nothing but still counts in the average
            raw_reasoning = step.get("reasoning")
            reasoning = "" if raw_reasoning is None else str(raw_reasoning)
            # Check for substantive content
            if len(reasoning) > 50:
                score += 0.3
            if step.get("evidence"):
                score += 0.2
            confidence = step.get("confidence", 0)
            if isinstance(confidence, (int, float)) and confidence > 0.5:
                score += 0.1

        return min(1.0, score / max(1, len(steps)))

    def _check_quantitative_content(self, steps: list) -> float:
        """Check for quantitative/numerical content in steps (0.2 when none)."""
        import re

        numbers = re.findall(r"\d+\.?\d*", self._steps_text(steps))
        return min(1.0, len(numbers) / 10.0) if numbers else 0.2

    def _check_statistical_content(self, steps: list) -> float:
        """Fraction of statistical vocabulary present; three hits reach 1.0."""
        keywords = [
            "mean",
            "variance",
            "standard deviation",
            "p-value",
            "confidence interval",
            "regression",
            "correlation",
            "hypothesis",
        ]
        found = self._keyword_hits(self._steps_text(steps).lower(), keywords)
        return min(1.0, found / 3.0)

    def _check_citation_content(self, steps: list) -> float:
        """Score citation-like markers; two hits reach 1.0."""
        indicators = ["et al", "according to", "study", "research", "paper", "["]
        found = self._keyword_hits(self._steps_text(steps).lower(), indicators)
        return min(1.0, found / 2.0)

    def _check_formal_solution(self, steps: list) -> float:
        """Score optimisation/solution vocabulary; three hits reach 1.0."""
        keywords = [
            "optimal",
            "maximize",
            "minimize",
            "equilibrium",
            "solution",
            "payoff",
            "strategy",
            "constraint",
        ]
        found = self._keyword_hits(self._steps_text(steps).lower(), keywords)
        return min(1.0, found / 3.0)

    def _check_constraint_handling(self, steps: list) -> float:
        """Return 0.7 when constraints are mentioned at all, else 0.3."""
        all_text = self._steps_text(steps).lower()
        if "constraint" in all_text or "subject to" in all_text or "bound" in all_text:
            return 0.7
        return 0.3

    def _check_equilibrium_analysis(self, steps: list) -> float:
        """Score game-theory vocabulary; two hits reach 1.0."""
        keywords = ["nash", "equilibrium", "dominant", "pareto", "mixed strategy"]
        found = self._keyword_hits(self._steps_text(steps).lower(), keywords)
        return min(1.0, found / 2.0)

    def _check_logical_structure(self, steps: list) -> float:
        """Check for logical reasoning structure.

        Starts from 0.3 for any non-empty step list and adds 0.15 per
        distinct logical connector found, capped at 1.0.
        """
        if not steps:
            return 0.0
        connectors = [
            "therefore",
            "because",
            "however",
            "moreover",
            "furthermore",
            "on the other hand",
            "in contrast",
            "consequently",
        ]
        found = self._keyword_hits(self._steps_text(steps).lower(), connectors)
        return min(1.0, 0.3 + found * 0.15)
+ """ + + def __init__( + self, + lean4_enabled: bool = False, + sandbox_enabled: bool = False, + ): + self.engine = ScoringEngine() + + # Initialize verification backends + lean4_checker = None + code_sandbox = None + math_checker = None + fact_checker = None + + if lean4_enabled: + try: + from ..verification.lean4_checker import Lean4Checker + + lean4_checker = Lean4Checker() + except ImportError: + logger.warning("Lean4 checker not available") + + if sandbox_enabled: + try: + from ..verification.code_sandbox import CodeSandbox + + code_sandbox = CodeSandbox() + except ImportError: + logger.warning("Code sandbox not available") + + try: + from ..verification.math_checker import MathChecker + + math_checker = MathChecker() + except ImportError: + logger.debug("Math checker not available") + + try: + from ..verification.fact_checker import FactChecker + + fact_checker = FactChecker() + except ImportError: + logger.debug("Fact checker not available") + + self.objective_scorer = ObjectiveScorer( + lean4_checker=lean4_checker, + code_sandbox=code_sandbox, + math_checker=math_checker, + fact_checker=fact_checker, + ) + + async def compute_dimensions(self, task: Task, response: dict) -> DimensionScores: + """ + Compute all 4 dimension scores for a miner's response. + Maps to Quality, Accuracy, Novelty, Efficiency in the whitepaper. 
def _score_quality(self, task: Task, response: dict) -> float:
    """
    Quality (40% of CMS): heuristic score for the shape of a reasoning chain.

    The result is the sum of four terms, capped at 1.0:
    - 0.3 * (number of steps / expected steps), where expected steps is
      ``max(3, task.difficulty)`` and the ratio is capped at 1.0
    - 0.3 * mean per-step ``confidence``
    - 0.2 * fraction of steps carrying ``evidence``
    - a flat 0.2, plus 0.1 when any step has a ``formal_proof_fragment``

    Confidence is reported by the miner itself, so every value is clamped
    to [0, 1] and missing / non-numeric / NaN values count as 0. Without
    the clamp a single step claiming ``confidence=50`` would saturate the
    whole score, and a ``None`` confidence would raise.

    Args:
        task: Task being scored; only ``task.difficulty`` is read.
        response: Miner response; only ``response["steps"]`` is read.

    Returns:
        Quality score in [0.0, 1.0]; 0.0 when there are no steps.
    """
    steps = response.get("steps", [])
    if not steps:
        return 0.0

    def _unit_interval(value) -> float:
        """Coerce a self-reported confidence into [0, 1]; junk becomes 0."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        if number != number:  # NaN never compares equal to itself
            return 0.0
        return min(1.0, max(0.0, number))

    # Step count vs difficulty expectation
    expected_steps = max(3, task.difficulty)
    step_ratio = min(1.0, len(steps) / expected_steps)

    # Average (sanitised) confidence
    confidences = [_unit_interval(s.get("confidence", 0)) for s in steps]
    avg_confidence = sum(confidences) / len(confidences)

    # Evidence presence
    evidence_count = sum(1 for s in steps if s.get("evidence"))
    evidence_ratio = evidence_count / len(steps)

    # Proof fragment bonus
    proof_bonus = 0.1 if any(s.get("formal_proof_fragment") for s in steps) else 0.0

    return min(
        1.0,
        (0.3 * step_ratio)
        + (0.3 * avg_confidence)
        + (0.2 * evidence_ratio)
        + (0.2 + proof_bonus),
    )
def _score_efficiency(self, task: Task, response: dict) -> float:
    """
    Efficiency (15% of CMS): score solve time relative to the task timeout.

    Scoring bands, with ``ratio = time_taken_ms / timeout_ms``:
    - ratio < 0.01  -> 0.2 (suspiciously fast)
    - ratio > 1.0   -> 0.0 (timed out)
    - otherwise     -> 1.0 - 0.5 * ratio

    Only a *missing* ``time_taken_ms`` falls back to the full timeout. A
    reported 0 ms is a real value and lands in the "suspiciously fast"
    band; previously ``or`` treated it as missing and it scored 0.5,
    higher than a 1 ms response. A non-positive timeout yields 0.0
    instead of dividing by zero.

    Args:
        task: Task being scored; only ``task.timeout_seconds`` is read.
        response: Miner response; only ``time_taken_ms`` is read.

    Returns:
        Efficiency score in [0.0, 1.0].
    """
    timeout_ms = task.timeout_seconds * 1000
    if timeout_ms <= 0:
        # No meaningful time budget to compare against.
        return 0.0

    time_ms = response.get("time_taken_ms")
    if time_ms is None:
        time_ms = timeout_ms

    time_ratio = time_ms / timeout_ms
    if time_ratio < 0.01:
        # Suspiciously fast
        time_score = 0.2
    elif time_ratio > 1.0:
        # Timed out
        time_score = 0.0
    else:
        time_score = 1.0 - (time_ratio * 0.5)

    return min(1.0, time_score)
def _load_benchmarks(self) -> None:
    """Load benchmark tasks from ``<benchmark_dir>/<domain>/*.json``.

    Every sub-directory of ``self.benchmark_dir`` becomes a key of
    ``self.benchmark_tasks``. Each JSON file inside it is expected to
    contain a list of task dicts, which are appended to that key's list.

    Directories and files are visited in sorted order so that the task
    lists, and therefore seeded sampling from them, do not depend on
    filesystem enumeration order. Unreadable, badly encoded or malformed
    files are logged and skipped, as are files whose top-level value is
    not a list and list entries that are not dicts (later code calls
    ``.get`` on every entry).
    """
    benchmark_path = Path(self.benchmark_dir)
    if not benchmark_path.is_dir():
        logger.warning("Benchmark directory not found: %s", self.benchmark_dir)
        return

    for domain_dir in sorted(benchmark_path.iterdir()):
        if not domain_dir.is_dir():
            continue
        domain_name = domain_dir.name
        self.benchmark_tasks[domain_name] = []

        for json_file in sorted(domain_dir.glob("*.json")):
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
                logger.warning("Failed to load %s: %s", json_file, e)
                continue

            if not isinstance(loaded, list):
                logger.warning(
                    "Skipping %s: expected a JSON list of tasks, got %s",
                    json_file,
                    type(loaded).__name__,
                )
                continue

            entries = [entry for entry in loaded if isinstance(entry, dict)]
            if len(entries) != len(loaded):
                logger.warning(
                    "Ignored %d non-object entries in %s",
                    len(loaded) - len(entries),
                    json_file,
                )
            self.benchmark_tasks[domain_name].extend(entries)
            logger.info("Loaded %d tasks from %s", len(entries), json_file)

    total = sum(len(v) for v in self.benchmark_tasks.values())
    logger.info(
        "Loaded %d total benchmark tasks across %d domains", total, len(self.benchmark_tasks)
    )
def _sample_balanced(self, count: int) -> List[Task]:
    """Sample ``count`` tasks spread across domains.

    Each domain contributes up to ``max(1, count // number_of_domains)``
    unused benchmark tasks, or that many synthetic tasks when it has no
    unused benchmark task left. Any shortfall is filled with synthetic
    tasks from randomly chosen domains.

    A domain never contributes more than the slots still open, so a
    benchmark task is only marked as used when it is actually returned.
    Previously one task per domain was always drawn and marked used, and
    the surplus was then thrown away by the final truncation. When there
    are fewer slots than domains, the visiting order is randomised so the
    slots do not always go to the first domains of the enum.

    Args:
        count: Number of tasks wanted; values <= 0 yield an empty list
            and consume nothing.

    Returns:
        A list of exactly ``count`` tasks (empty for ``count <= 0``).
    """
    if count <= 0:
        return []

    tasks: List[Task] = []
    domains = list(Domain)
    per_domain = max(1, count // len(domains))

    visit_order = domains
    if count < len(domains):
        # Not every domain can get a slot: pick the lucky ones at random.
        visit_order = self.rng.sample(domains, len(domains))

    for domain in visit_order:
        quota = min(per_domain, count - len(tasks))
        if quota <= 0:
            break

        # Filter out already-used tasks
        available = [
            t
            for t in self.benchmark_tasks.get(domain.value, [])
            if t.get("task_id", "") not in self.used_task_ids
        ]

        if available:
            for task_data in self.rng.sample(available, min(quota, len(available))):
                task = self._task_from_benchmark(task_data, domain)
                tasks.append(task)
                self.used_task_ids.add(task.task_id)
        else:
            # Fall back to synthetic generation
            tasks.extend(self._generate_synthetic(quota, domain))

    # If we still need more, add synthetic tasks
    while len(tasks) < count:
        tasks.extend(self._generate_synthetic(1, self.rng.choice(domains)))

    return tasks[:count]
def _generate_synthetic(self, count: int, domain: Domain) -> List[Task]:
    """Create ``count`` synthetic, non-trap tasks for ``domain``.

    The problem text is drawn from ``TASK_TEMPLATES[domain]`` when that
    domain has templates, otherwise a generic one-line prompt is used.
    Difficulty is a random integer in [2, 9] and each task is flagged
    ``previously_unsolved`` with 5% probability. All randomness comes
    from ``self.rng``.
    """
    from ..task_generator import TASK_TEMPLATES

    pool = TASK_TEMPLATES.get(domain, [])

    def _draw_prompt() -> str:
        if pool:
            return self.rng.choice(pool)
        return f"Solve a {domain.value} reasoning problem."

    generated = []
    for _ in range(count):
        # Draw order (prompt, difficulty, unsolved flag) is kept fixed so a
        # seeded generator produces the same tasks.
        prompt = _draw_prompt()
        level = self.rng.randint(2, 9)
        unsolved = self.rng.random() < 0.05
        generated.append(
            Task(
                task_id=str(uuid.uuid4()),
                problem=prompt,
                domain=domain,
                difficulty=level,
                source=TaskSource.SYNTHETIC,
                is_trap=False,
                previously_unsolved=unsolved,
            )
        )
    return generated
+ problem=problem, + domain=Domain(domain) if domain else self.rng.choice(list(Domain)), + difficulty=difficulty or self.rng.randint(3, 8), + source=TaskSource.USER_API, + ) + self.api_queue.append(task) + return task diff --git a/reasonforge/validator/trap_manager.py b/reasonforge/validator/trap_manager.py new file mode 100644 index 0000000..d6896b4 --- /dev/null +++ b/reasonforge/validator/trap_manager.py @@ -0,0 +1,136 @@ +""" +ReasonForge - Trap Manager + +Injects trap problems with known ground-truth scores and evaluates +miner responses against them for integrity checking. +""" + +from __future__ import annotations + +import logging +from collections import defaultdict +from typing import Dict, List + +from ..types import TRAP_RATE, Task + +logger = logging.getLogger("reasonforge.validator.trap_manager") + + +class TrapManager: + """Inject trap problems and evaluate responses against ground truth.""" + + def __init__(self, trap_rate: float = TRAP_RATE): + self.trap_rate = trap_rate + self.trap_results: Dict[int, List[float]] = defaultdict(list) # uid -> scores + + def evaluate_trap_response( + self, + task: Task, + final_answer: str | None, + reasoning_steps: list[dict] | None = None, + ) -> float: + """ + Compare miner response against ground truth. + Returns a score 0.0-1.0 indicating how correct the response is. 
def _evaluate_math_trap(self, task: Task, answer: str) -> float:
    """Grade a mathematics trap answer with keyword heuristics.

    Three known traps are recognised from the problem text:
    - "2+2" / "2 + 2": 1.0 when the answer states the number 4, else 0.0
    - "prime" together with "7": 0.95 for an affirmative answer, else 0.2
    - "derivative" together with "x^3": 1.0 for 3x^2, else 0.1
    Anything else gets 0.5 for an answer longer than 20 characters and
    0.2 otherwise.

    The 2+2 check matches 4 as a standalone number ("4", "4.0", "= 4.")
    or the word "four". A plain substring test also accepted "14", "45"
    or "0.4", which let wrong answers pass an integrity trap.

    Args:
        task: Trap task; only ``task.problem`` is read.
        answer: The miner's final answer text.

    Returns:
        Heuristic correctness score in [0.0, 1.0].
    """
    import re  # local import: the surrounding module does not import re

    answer_lower = answer.lower().strip()
    problem_lower = task.problem.lower()

    # Simple numeric checks for known traps
    if "2+2" in problem_lower or "2 + 2" in problem_lower:
        # 4 must not be part of a longer number (14, 40, 0.4, 4.5);
        # a trailing ".0" or sentence-ending "." is fine.
        states_four = re.search(
            r"(?<![\d.])4(?:\.0+)?(?!\.?\d)", answer_lower
        ) or re.search(r"\bfour\b", answer_lower)
        return 1.0 if states_four else 0.0

    if "prime" in problem_lower and "7" in problem_lower:
        if "prime" in answer_lower and (
            "yes" in answer_lower or "is prime" in answer_lower or "true" in answer_lower
        ):
            return 0.95
        return 0.2

    if "derivative" in problem_lower and "x^3" in problem_lower:
        if "3x^2" in answer_lower or "3x²" in answer_lower or "3 x^2" in answer_lower:
            return 1.0
        return 0.1

    # Default: check if answer is non-empty and contains reasoning
    return 0.5 if len(answer_lower) > 20 else 0.2
def get_trap_scores(self, uid: int) -> List[float]:
    """Return a copy of all trap scores recorded for a miner.

    A new list is returned every time, so callers can sort or extend the
    result without altering the stored history. Previously a known uid
    got the internal list by reference while an unknown uid got a
    throwaway list, so caller-side mutation behaved inconsistently.

    Args:
        uid: Miner UID.

    Returns:
        The recorded scores in insertion order; empty when the miner has
        none. Looking up an unknown uid does not create an entry.
    """
    return list(self.trap_results.get(uid, ()))
def submit(
    self,
    uids,
    weights,
    netuid: int,
    max_retries: int = 3,
) -> bool:
    """Submit weights to chain with retry logic.

    Calls ``self.subtensor.set_weights`` up to ``max_retries`` times and
    waits 5 seconds between attempts. There is no wait after the final
    attempt, so a total failure returns immediately.

    Args:
        uids: UIDs to set weights for (passed through unchanged).
        weights: Matching weights (passed through unchanged).
        netuid: Subnet to submit to.
        max_retries: Maximum number of attempts.

    Returns:
        True as soon as one attempt reports success. False when bittensor
        is unavailable, the subtensor or wallet is not configured, or
        every attempt failed or raised.
    """
    if not HAS_BITTENSOR or not self.subtensor or not self.wallet:
        logger.warning("Cannot submit weights: bittensor not available or not configured")
        return False

    for attempt in range(max_retries):
        try:
            result = self.subtensor.set_weights(
                netuid=netuid,
                wallet=self.wallet,
                uids=uids,
                weights=weights,
                wait_for_inclusion=True,
                wait_for_finalization=False,
            )
        except Exception as e:
            # Best-effort retry: any chain/network error just costs one attempt.
            logger.warning("Weight setting attempt %d failed: %s", attempt + 1, e)
        else:
            # NOTE(review): depending on the bittensor release, set_weights
            # appears to return a bare bool, a (success, message) tuple or a
            # response object with a `success` attribute - confirm. A
            # non-empty tuple is always truthy, so it must be unpacked.
            if isinstance(result, tuple):
                success = bool(result[0]) if result else False
            elif hasattr(result, "success"):
                success = bool(result.success)
            else:
                success = bool(result)

            if success:
                logger.info("Weights set successfully (attempt %d)", attempt + 1)
                return True
            logger.warning("Weight setting returned False (attempt %d)", attempt + 1)

        # Back off only when another attempt will follow.
        if attempt + 1 < max_retries:
            time.sleep(5)

    logger.error("Failed to set weights after %d attempts", max_retries)
    return False
async def run_tests(self, code_b64: str) -> float:
    """
    Execute code in sandbox and run any included test cases.

    The base64 payload is decoded, size-checked and screened by
    ``_contains_dangerous_patterns``. It is then run with ``python3 -c``
    in a detached container that has no network, a read-only root
    filesystem, a 64 MB ``/tmp``, the configured memory/CPU limits and a
    cap on the number of processes. The container is always removed
    afterwards.

    Args:
        code_b64: Base64-encoded UTF-8 source code.

    Returns:
        Float 0.0-1.0 based on test pass rate. 0.0 for undecodable,
        oversized or rejected code, a non-zero container exit status, or
        any sandbox error (including exceeding ``self.timeout``).
    """
    try:
        code = base64.b64decode(code_b64).decode("utf-8")
    except (ValueError, TypeError):
        # Covers invalid base64 (binascii.Error), wrong input type and
        # bytes that are not valid UTF-8.
        return 0.0

    # Validate code size
    if len(code) > 500_000:
        logger.warning("Code artifact too large")
        return 0.0

    # Security check: reject obviously dangerous code
    if self._contains_dangerous_patterns(code):
        logger.warning("Code contains dangerous patterns")
        return 0.0

    try:
        client = self._get_client()

        # The Docker SDK is blocking, so every call goes through the
        # default executor of the loop this coroutine is running on.
        loop = asyncio.get_running_loop()
        container = await loop.run_in_executor(
            None,
            lambda: client.containers.run(
                self.image,
                command=["python3", "-c", code],
                detach=True,
                mem_limit=self.memory_limit,
                cpu_period=self.cpu_period,
                cpu_quota=self.cpu_quota,
                # Without a PID cap a fork bomb is only bounded by memory.
                pids_limit=64,
                network_disabled=True,
                read_only=True,
                tmpfs={"/tmp": "size=64m"},
            ),
        )

        try:
            result = await loop.run_in_executor(
                None, lambda: container.wait(timeout=self.timeout)
            )
            logs = await loop.run_in_executor(
                None, lambda: container.logs().decode("utf-8", errors="replace")
            )

            if result["StatusCode"] == 0:
                return self._parse_test_results(logs)
            return 0.0
        finally:
            # force=True also kills a container that is still running
            # after a wait() timeout.
            await loop.run_in_executor(None, lambda: container.remove(force=True))

    except Exception as e:
        logger.error("Sandbox execution failed: %s", e)
        return 0.0
def _parse_test_results(self, logs: str) -> float:
    """Parse test-runner output into a pass rate.

    Recognised formats, tried in this order:
    - pytest summary ("N passed", optionally "N failed" / "N error(s)"):
      passed / (passed + failed + errors)
    - unittest summary ("Ran N test(s)"): 1.0 when a line starts with
      "OK", otherwise (N - failures - errors) / N, floored at 0.0
    - anything else: 0.7 partial credit, because the caller only parses
      output of runs that exited with status 0

    Errors count as not passed in both formats. The unittest success
    marker must start a line, so output that merely contains the letters
    "OK" (e.g. "TOKEN") is not mistaken for success.

    Args:
        logs: Combined container output.

    Returns:
        Pass rate in [0.0, 1.0].
    """

    def _count(pattern: str) -> int:
        """Return the first captured integer for ``pattern``, or 0."""
        found = re.search(pattern, logs)
        return int(found.group(1)) if found else 0

    # Try pytest format
    passed_match = re.search(r"(\d+)\s+passed", logs)
    if passed_match:
        passed = int(passed_match.group(1))
        total = passed + _count(r"(\d+)\s+failed") + _count(r"(\d+)\s+errors?\b")
        return passed / total if total > 0 else 0.0

    # Try unittest format
    ran_match = re.search(r"Ran\s+(\d+)\s+test", logs)
    if ran_match:
        total = int(ran_match.group(1))
        if re.search(r"^OK\b", logs, re.MULTILINE):
            return 1.0
        if total <= 0:
            return 0.0
        not_passed = _count(r"failures=(\d+)") + _count(r"errors=(\d+)")
        return max(0.0, (total - not_passed) / total)

    # Fallback: if code ran without error, partial credit
    return 0.7
def verify_claims(self, text: str) -> float:
    """
    Verify factual claims in text.

    Every key of ``KNOWN_FACTS`` that is mentioned in the text counts as
    one claim; the claim is verified when ``_value_matches`` finds the
    expected value near the mention.

    A key only counts as mentioned when it appears as a whole word or
    phrase. With plain substring matching the key "pi" fired on words
    such as "spinning" or "pipeline", creating claims that could never
    be verified and dragging the score towards 0.

    Args:
        text: Free text to scan (matched case-insensitively).

    Returns:
        Score 0.0-1.0 based on claim accuracy: verified claims divided by
        claims found, or a neutral 0.5 when the text is empty or mentions
        no known fact.
    """
    if not text:
        return 0.5

    claims_found = 0
    claims_verified = 0

    text_lower = text.lower()

    for fact_key, fact_value in self.KNOWN_FACTS.items():
        # Reject matches embedded in a longer word on either side.
        mentioned = re.search(r"(?<!\w)" + re.escape(fact_key) + r"(?!\w)", text_lower)
        if not mentioned:
            continue
        claims_found += 1
        # Check if the correct value appears near the fact mention
        if self._value_matches(text_lower, fact_key, fact_value):
            claims_verified += 1

    if claims_found == 0:
        return 0.5  # No verifiable claims

    return claims_verified / claims_found
def verify_scientific_claims(self, domain: str, text: str) -> float:
    """
    Heuristically rate how "scientific" a piece of text looks.

    Starting from 0.3 for any non-empty text, the score gains:
    - 0.05 per distinct methodology keyword present (at most 0.4)
    - 0.02 per numeric token (at most 0.2)
    - 0.03 per distinct unit name present
    and is finally capped at 1.0. Keyword and unit checks are
    case-insensitive substring tests.

    Args:
        domain: Accepted for interface compatibility; not used.
        text: Text to analyse.

    Returns:
        Score in [0.0, 1.0]; 0.0 for empty text.
    """
    if not text:
        return 0.0

    method_terms = (
        "hypothesis",
        "method",
        "result",
        "conclusion",
        "control",
        "variable",
        "experiment",
        "observation",
        "data",
        "analysis",
        "significant",
        "evidence",
    )
    unit_terms = (
        "kg",
        "m/s",
        "mol",
        "kelvin",
        "joule",
        "watt",
        "newton",
        "pascal",
        "hertz",
        "volt",
        "ampere",
    )
    lowered = text.lower()

    # Base credit for having content, plus methodological vocabulary.
    total = 0.3
    term_hits = len([term for term in method_terms if term in lowered])
    total += min(0.4, term_hits * 0.05)

    # Quantitative content: count numeric tokens in the original text.
    numeric_tokens = re.findall(r"\d+\.?\d*", text)
    if numeric_tokens:
        total += min(0.2, len(numeric_tokens) * 0.02)

    # One fixed increment per unit name that occurs.
    for _ in filter(lambda unit: unit in lowered, unit_terms):
        total += 0.03

    return min(1.0, total)
async def verify(self, proof_b64: str) -> float:
    """
    Decode proof artifact -> write to temp .lean file -> run lean4 -> check result.

    The temporary file is written as UTF-8, because Lean sources
    routinely contain non-ASCII symbols, and is always deleted. A Lean
    process that is still running when the timeout fires is killed
    rather than left behind.

    A zero exit status alone is not accepted as proof: Lean only warns
    when a declaration uses ``sorry``, so output mentioning that warning
    is scored as a failed proof.

    Args:
        proof_b64: Base64-encoded UTF-8 Lean source.

    Returns:
        1.0 if proof compiles successfully without ``sorry``
        0.0 if the artifact cannot be decoded, is too large, fails to
            compile, or relies on ``sorry``
        0.5 if Lean is unavailable, on timeout or any other error
    """
    if not await self.is_available():
        logger.debug("Lean 4 not available, returning neutral score")
        return 0.5

    try:
        proof_text = base64.b64decode(proof_b64).decode("utf-8")
    except Exception as e:
        logger.warning("Failed to decode proof artifact: %s", e)
        return 0.0

    # Validate proof text isn't too large
    if len(proof_text) > 100_000:
        logger.warning("Proof artifact too large (%d bytes)", len(proof_text))
        return 0.0

    tmp_path = None
    proc = None
    try:
        # Write to temp file; delete=False because Lean reopens it by name.
        with tempfile.NamedTemporaryFile(
            suffix=".lean", mode="w", encoding="utf-8", delete=False
        ) as tmpfile:
            tmp_path = tmpfile.name
            tmpfile.write(proof_text)

        # Run lean4
        proc = await asyncio.create_subprocess_exec(
            self.lean_path,
            tmp_path,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=self.timeout)
        out_text = stdout.decode("utf-8", errors="replace")
        err_text = stderr.decode("utf-8", errors="replace")

        if proc.returncode == 0:
            # NOTE(review): relies on Lean's "declaration uses 'sorry'"
            # warning wording - confirm against the deployed toolchain.
            if "uses 'sorry'" in out_text or "uses 'sorry'" in err_text:
                logger.info("Proof verification failed: %s", "proof relies on 'sorry'")
                return 0.0
            logger.info("Proof verified successfully")
            return 1.0

        # Diagnostics may arrive on either stream; prefer stderr.
        error_msg = (err_text or out_text)[:500]
        logger.info("Proof verification failed: %s", error_msg)
        return 0.0

    except asyncio.TimeoutError:
        logger.warning("Lean 4 verification timed out after %ds", self.timeout)
        return 0.5
    except Exception as e:
        logger.error("Lean 4 verification error: %s", e)
        return 0.5
    finally:
        # A timed-out or failed run can leave the child alive; reap it.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
                await proc.wait()
            except ProcessLookupError:
                pass
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
def _verify_numeric(self, problem: str, answer: str) -> Optional[float]:
    """Check the answer against a small table of known simple problems.

    A table entry applies when its pattern occurs in the lower-cased
    problem; the answer is then correct when it contains the expected
    text. Both lookups are boundary-aware: the match may not be glued to
    a neighbouring letter, digit or decimal fraction. So "12+2" does not
    trigger the "2+2" rule, "14" is not accepted as "4", and "3x^2" is
    not accepted as "x^2", while "4.", "4.0" and "= 4" still are.

    Args:
        problem: Problem statement.
        answer: The miner's answer text.

    Returns:
        The entry's score (1.0) for a correct answer, 0.0 for a wrong one,
        or None when no table entry applies (correctness cannot be judged
        from free-standing numbers alone).
    """

    def _mentions(needle: str, haystack: str) -> bool:
        """True when ``needle`` occurs in ``haystack`` as a standalone token."""
        pattern = (
            r"(?<![\w.])"  # not preceded by a word character or a decimal point
            + re.escape(needle)
            + r"(?!\w)"  # not followed by a word character ...
            + r"(?!\.\d*[1-9])"  # ... nor by a non-zero decimal fraction
        )
        return re.search(pattern, haystack) is not None

    problem_lower = problem.lower()
    answer_lower = answer.lower().strip()

    # Known simple problems
    simple_checks = {
        "2+2": ("4", 1.0),
        "2 + 2": ("4", 1.0),
        "3 + 5": ("8", 1.0),
        "derivative of x^3": ("3x^2", 1.0),
        "integral of 2x": ("x^2", 1.0),
    }

    for pattern, (expected, score) in simple_checks.items():
        if _mentions(pattern, problem_lower):
            return score if _mentions(expected, answer_lower) else 0.0

    return None  # Can't determine correctness from numbers alone
#!/bin/bash
set -e

# ============================================================
# ReasonForge - Register Neurons Script
# Registers miner and validator UIDs on a subnet
#
# Environment overrides:
#   WALLET_NAME               wallet (coldkey) name    (default: owner)
#   NETUID                    target subnet UID        (default: 1)
#   SUBTENSOR_NETWORK         network name             (default: test)
#   SUBTENSOR_CHAIN_ENDPOINT  custom chain endpoint    (default: unset)
# ============================================================

WALLET_NAME="${WALLET_NAME:-owner}"
NETUID="${NETUID:-1}"
SUBTENSOR_NETWORK="${SUBTENSOR_NETWORK:-test}"
SUBTENSOR_CHAIN_ENDPOINT="${SUBTENSOR_CHAIN_ENDPOINT:-}"

echo "============================================"
echo " ReasonForge Neuron Registration"
echo "============================================"
echo ""

# Check if btcli is installed
if ! command -v btcli &> /dev/null; then
    echo "ERROR: btcli is not installed. Install it with:"
    echo " pip install bittensor"
    exit 1
fi

# Optional endpoint flags live in an array so they expand to separate,
# correctly quoted words - or to nothing at all when no endpoint is set.
CHAIN_ARGS=()
if [ -n "${SUBTENSOR_CHAIN_ENDPOINT}" ]; then
    CHAIN_ARGS=(--subtensor.chain_endpoint "${SUBTENSOR_CHAIN_ENDPOINT}")
fi

# Register hotkey $1 of ${WALLET_NAME} on subnet ${NETUID}.
# A failure aborts the script (set -e).
register_neuron() {
    btcli subnet register \
        --wallet.name "${WALLET_NAME}" \
        --wallet.hotkey "$1" \
        --netuid "${NETUID}" \
        --subtensor.network "${SUBTENSOR_NETWORK}" \
        "${CHAIN_ARGS[@]}" \
        --no_prompt
}

echo "Network: ${SUBTENSOR_NETWORK}"
echo "Wallet: ${WALLET_NAME}"
echo "Subnet UID: ${NETUID}"
echo ""

# Register miner
echo "[1/2] Registering miner on subnet ${NETUID}..."
register_neuron miner

echo ""

# Register validator
echo "[2/2] Registering validator on subnet ${NETUID}..."
register_neuron validator

echo ""
echo "============================================"
echo " Neuron registration complete!"
echo "============================================"
echo ""
echo "Verify registrations:"
echo " btcli subnet metagraph --netuid ${NETUID} --subtensor.network ${SUBTENSOR_NETWORK} ${CHAIN_ARGS[*]}"
#!/bin/bash
set -e

# ============================================================
# ReasonForge - Local Network Setup Script
# Starts local subtensor, creates wallets, funds them,
# registers subnet, registers neurons, and stakes
#
# Environment overrides:
#   WALLET_NAME               wallet (coldkey) name       (default: owner)
#   STAKE_AMOUNT              TAO staked for validator    (default: 1000)
#   SUBTENSOR_CHAIN_ENDPOINT  chain endpoint              (default: ws://127.0.0.1:9944)
#   NETUID                    subnet UID to register on   (default: 1)
#   STARTUP_WAIT_SECONDS      pause after container start (default: 10)
# ============================================================

WALLET_NAME="${WALLET_NAME:-owner}"
STAKE_AMOUNT="${STAKE_AMOUNT:-1000}"
SUBTENSOR_CHAIN_ENDPOINT="${SUBTENSOR_CHAIN_ENDPOINT:-ws://127.0.0.1:9944}"
NETUID="${NETUID:-1}"
STARTUP_WAIT_SECONDS="${STARTUP_WAIT_SECONDS:-10}"
CONTAINER_NAME="subtensor-local"

echo "============================================"
echo " ReasonForge Local Network Setup"
echo "============================================"
echo ""

# Check prerequisites
for cmd in btcli docker; do
    if ! command -v "$cmd" &> /dev/null; then
        echo "ERROR: ${cmd} is not installed."
        exit 1
    fi
done

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "${SCRIPT_DIR}")"

# Endpoint flags as an array so the URL stays one correctly quoted word.
CHAIN_ARGS=(--subtensor.chain_endpoint "${SUBTENSOR_CHAIN_ENDPOINT}")

# Run a command with stderr silenced; on failure print hint $1 instead of
# aborting. Used for steps that are expected to fail on a re-run (wallet
# already exists, neuron already registered, ...).
best_effort() {
    local hint="$1"
    shift
    "$@" 2>/dev/null || echo "${hint}"
}

# ---- Step 1: Start local subtensor ----
echo "[1/6] Starting local subtensor..."
# Only skip the start when the container really is running; any other
# docker failure (daemon down, port in use, image pull error) stops the script.
if docker ps --format '{{.Names}}' | grep -qx "${CONTAINER_NAME}"; then
    echo " -> subtensor-local already running."
else
    docker run -d \
        --name "${CONTAINER_NAME}" \
        --rm \
        -p 9944:9944 \
        -p 9933:9933 \
        -p 30333:30333 \
        opentensor/subtensor:latest \
        --dev \
        --ws-external \
        --rpc-external \
        --rpc-cors all
fi

echo " Waiting for subtensor to be ready..."
sleep "${STARTUP_WAIT_SECONDS}"

# ---- Step 2: Create wallets ----
echo ""
echo "[2/6] Creating wallets..."
best_effort " -> Coldkey '${WALLET_NAME}' already exists." \
    btcli wallet new_coldkey --wallet.name "${WALLET_NAME}" --no_prompt
best_effort " -> Hotkey 'miner' already exists." \
    btcli wallet new_hotkey --wallet.name "${WALLET_NAME}" --wallet.hotkey miner --no_prompt
best_effort " -> Hotkey 'validator' already exists." \
    btcli wallet new_hotkey --wallet.name "${WALLET_NAME}" --wallet.hotkey validator --no_prompt

# ---- Step 3: Fund wallets from faucet ----
echo ""
echo "[3/6] Funding wallets from faucet..."
best_effort " -> Faucet funding attempted (may require manual funding)." \
    btcli wallet faucet \
        --wallet.name "${WALLET_NAME}" \
        --subtensor.network local \
        "${CHAIN_ARGS[@]}" \
        --no_prompt

# ---- Step 4: Register subnet ----
echo ""
echo "[4/6] Registering subnet..."
best_effort " -> Subnet may already exist." \
    btcli subnet create \
        --wallet.name "${WALLET_NAME}" \
        --subtensor.network local \
        "${CHAIN_ARGS[@]}" \
        --no_prompt

# The UID of the new subnet is not read back from the chain; set NETUID in
# the environment if the subnet did not end up at the default UID.
echo " Using NETUID=${NETUID}"

# ---- Step 5: Register neurons ----
echo ""
echo "[5/6] Registering neurons on subnet ${NETUID}..."
best_effort " -> Miner may already be registered." \
    btcli subnet register \
        --wallet.name "${WALLET_NAME}" \
        --wallet.hotkey miner \
        --netuid "${NETUID}" \
        --subtensor.network local \
        "${CHAIN_ARGS[@]}" \
        --no_prompt

best_effort " -> Validator may already be registered." \
    btcli subnet register \
        --wallet.name "${WALLET_NAME}" \
        --wallet.hotkey validator \
        --netuid "${NETUID}" \
        --subtensor.network local \
        "${CHAIN_ARGS[@]}" \
        --no_prompt

# ---- Step 6: Stake TAO for validator ----
echo ""
echo "[6/6] Staking ${STAKE_AMOUNT} TAO for validator..."
best_effort " -> Staking attempted (may need more funds)." \
    btcli stake add \
        --wallet.name "${WALLET_NAME}" \
        --wallet.hotkey validator \
        --amount "${STAKE_AMOUNT}" \
        --subtensor.network local \
        "${CHAIN_ARGS[@]}" \
        --no_prompt

echo ""
echo "============================================"
echo " Local network setup complete!"
echo "============================================"
echo ""
echo "Subtensor: ${SUBTENSOR_CHAIN_ENDPOINT}"
echo "Subnet: ${NETUID}"
echo "Wallet: ${WALLET_NAME}"
echo ""
echo "Start services:"
echo " cd ${PROJECT_DIR}/docker"
echo " docker compose -f docker-compose.localnet.yml up --build"
echo ""
echo "To stop local subtensor:"
echo " docker stop subtensor-local"
"""
ReasonForge - Shared Test Fixtures

Fixtures shared by every test module in the suite.
"""

import os
import sys
import tempfile

import pytest

# Put the repository root on sys.path so the tests can import `reasonforge`.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))


@pytest.fixture
def temp_db_path():
    """Yield the path of an empty temporary ``.db`` file; remove it afterwards."""
    handle = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
    handle.close()
    db_file = handle.name
    yield db_file
    try:
        os.unlink(db_file)
    except OSError:
        # The file is already gone - nothing left to clean up.
        pass


@pytest.fixture
def sample_task():
    """Build a synthetic mathematics task of medium difficulty."""
    from reasonforge.types import Domain, Task, TaskSource

    fields = {
        "task_id": "test-task-001",
        "problem": "Prove that sqrt(2) is irrational.",
        "domain": Domain.MATHEMATICS,
        "difficulty": 5,
        "timeout_seconds": 300,
        "source": TaskSource.SYNTHETIC,
    }
    return Task(**fields)


@pytest.fixture
def sample_trap_task():
    """Build a trap task whose ground-truth score is 1.0."""
    from reasonforge.types import Domain, Task, TaskSource

    fields = {
        "task_id": "test-trap-001",
        "problem": "What is 2+2?",
        "domain": Domain.MATHEMATICS,
        "difficulty": 2,
        "source": TaskSource.TRAP,
        "is_trap": True,
        "ground_truth_score": 1.0,
    }
    return Task(**fields)


@pytest.fixture
def sample_reasoning_response():
    """Build a ReasoningTask carrying a three-step proof as the miner response."""
    from reasonforge.protocol import ReasoningTask

    # (reasoning, evidence, confidence) for each step, in order.
    proof_outline = [
        (
            "Assume sqrt(2) is rational, so sqrt(2) = p/q where p,q are integers "
            "with no common factors.",
            "Proof by contradiction setup",
            0.9,
        ),
        (
            "Then 2 = p^2/q^2, so p^2 = 2q^2. This means p^2 is even, so p is even.",
            "If p^2 is even then p is even (contrapositive: odd^2 is odd)",
            0.95,
        ),
        (
            "Let p = 2k. Then 4k^2 = 2q^2, so q^2 = 2k^2. Thus q is also even. "
            "Contradiction.",
            "Both p and q are even contradicts our assumption of no common factors.",
            0.95,
        ),
    ]
    steps = [
        {
            "step_id": index,
            "reasoning": reasoning,
            "evidence": evidence,
            "confidence": confidence,
            "formal_proof_fragment": None,
        }
        for index, (reasoning, evidence, confidence) in enumerate(proof_outline)
    ]

    return ReasoningTask(
        task_id="test-task-001",
        problem="Prove that sqrt(2) is irrational.",
        domain="mathematics",
        difficulty=5,
        reasoning_steps=steps,
        final_answer="sqrt(2) is irrational by proof by contradiction.",
        time_taken_ms=5000,
    )
class TestSimilarityDetector:
    """Checks for the embedding-based similarity detector.

    Detectors are created with ``__new__`` so ``__init__`` never runs.
    """

    class _FakeResponse:
        """Minimal stand-in exposing the two attributes the detector reads."""

        reasoning_steps = [{"reasoning": "test"}]
        final_answer = "test"

    @staticmethod
    def _bare_detector():
        """Import lazily and return an uninitialised SimilarityDetector."""
        from reasonforge.embeddings.similarity import SimilarityDetector

        return SimilarityDetector.__new__(SimilarityDetector)

    def test_import(self):
        """Test that the module can be imported."""
        assert self._bare_detector() is not None

    def test_cosine_similarity_identical(self):
        # Two copies of the same unit vector: similarity must be 1.
        left = np.array([1.0, 0.0, 0.0])
        right = np.array([1.0, 0.0, 0.0])
        score = self._bare_detector().cosine_similarity(left, right)
        assert abs(score - 1.0) < 1e-6

    def test_cosine_similarity_orthogonal(self):
        left = np.array([1.0, 0.0, 0.0])
        right = np.array([0.0, 1.0, 0.0])
        score = self._bare_detector().cosine_similarity(left, right)
        assert abs(score) < 1e-6

    def test_cosine_similarity_opposite(self):
        left = np.array([1.0, 0.0])
        right = np.array([-1.0, 0.0])
        score = self._bare_detector().cosine_similarity(left, right)
        assert abs(score - (-1.0)) < 1e-6

    def test_empty_history_check(self):
        detector = self._bare_detector()
        detector.history_embeddings = []

        assert detector.check_against_history(self._FakeResponse()) == 0.0

    def test_empty_batch_check(self):
        detector = self._bare_detector()

        assert detector.check_against_batch(self._FakeResponse(), []) == 0.0
8 — Validator Emission Distribution # ────────────────────────────────────────────── + class TestValidatorEmissions: def test_validator_emission_conservation(self): """Eq.8: sum(validator rewards) == pool""" validators = [ - ValidatorState(validator_id="v1", stake=5000, current_vas=0.95, reputation_multiplier=1.4), - ValidatorState(validator_id="v2", stake=3000, current_vas=0.90, reputation_multiplier=1.2), - ValidatorState(validator_id="v3", stake=1000, current_vas=0.70, reputation_multiplier=1.0), + ValidatorState( + validator_id="v1", stake=5000, current_vas=0.95, reputation_multiplier=1.4 + ), + ValidatorState( + validator_id="v2", stake=3000, current_vas=0.90, reputation_multiplier=1.2 + ), + ValidatorState( + validator_id="v3", stake=1000, current_vas=0.70, reputation_multiplier=1.0 + ), ] pool = 10.0 rewards = ScoringEngine.distribute_validator_emissions(validators, pool) @@ -215,8 +226,12 @@ def test_validator_emission_conservation(self): def test_validator_higher_stake_more_reward(self): """Eq.8: higher stake*VAS*rep -> more reward""" validators = [ - ValidatorState(validator_id="v1", stake=5000, current_vas=0.95, reputation_multiplier=1.0), - ValidatorState(validator_id="v2", stake=1000, current_vas=0.95, reputation_multiplier=1.0), + ValidatorState( + validator_id="v1", stake=5000, current_vas=0.95, reputation_multiplier=1.0 + ), + ValidatorState( + validator_id="v2", stake=1000, current_vas=0.95, reputation_multiplier=1.0 + ), ] rewards = ScoringEngine.distribute_validator_emissions(validators, 10.0) assert rewards[0] > rewards[1] @@ -226,6 +241,7 @@ def test_validator_higher_stake_more_reward(self): # Eq. 9 — Trap Penalty # ────────────────────────────────────────────── + class TestTrapPenalty: def test_trap_above_threshold(self): """Eq.9: avg >= theta_trap -> penalty = 1.0""" @@ -250,6 +266,7 @@ def test_trap_empty(self): # Eq. 
10 — Slash # ────────────────────────────────────────────── + class TestSlash: def test_slash_below_threshold(self): """Eq.10: VAS_avg=0.40 -> slash = gamma * stake * (0.60-0.40)^2""" @@ -272,6 +289,7 @@ def test_slash_at_threshold(self): # Eq. 11 — Objective Score # ────────────────────────────────────────────── + class TestObjectiveScore: def test_objective_score(self): """Eq.11: weighted sum of checks""" @@ -293,6 +311,7 @@ def test_objective_missing_check(self): # Eq. 12 — Consensus Score # ────────────────────────────────────────────── + class TestConsensusScore: def test_consensus_trimmed_median(self): """Eq.12: verify trimming works with 5+ validators""" @@ -313,7 +332,9 @@ def test_consensus_trimmed_median(self): # result = (0.7*2000/9000 + 0.8*3000/9000 + 0.85*2000/9000) / (7000/9000) # = (1400 + 2400 + 1700) / 7000 = 5500/7000 = 0.78571... total_stake = 9000 - numerator = 0.7 * (2000 / total_stake) + 0.8 * (3000 / total_stake) + 0.85 * (2000 / total_stake) + numerator = ( + 0.7 * (2000 / total_stake) + 0.8 * (3000 / total_stake) + 0.85 * (2000 / total_stake) + ) denom = (2000 + 3000 + 2000) / total_stake expected = numerator / denom assert abs(result - expected) < 1e-6 @@ -338,6 +359,7 @@ def test_consensus_empty(self): # Eq. 13 — Final Score # ────────────────────────────────────────────── + class TestFinalScore: def test_final_score(self): """Eq.13: FinalScore = 0.60*O + 0.40*C""" diff --git a/tests/test_gateway.py b/tests/test_gateway.py new file mode 100644 index 0000000..587ac42 --- /dev/null +++ b/tests/test_gateway.py @@ -0,0 +1,130 @@ +""" +ReasonForge - Gateway Tests + +Tests for API gateway endpoints, authentication, and rate limiting. 
class TestAPIKeyManager:
    """Test API key management."""

    def test_create_key(self):
        mgr = APIKeyManager()
        key = mgr.create_key("test_user", "free")
        assert key.startswith("rf_")
        assert len(key) > 10

    def test_verify_key_no_db(self):
        mgr = APIKeyManager()
        info = mgr.verify_key("rf_some_key_value")
        assert info is not None
        assert info.tier == "free"

    def test_verify_invalid_key(self):
        mgr = APIKeyManager()
        info = mgr.verify_key("invalid_key")
        assert info is None

    def test_tier_limits(self):
        assert TIER_LIMITS["free"] == 100
        assert TIER_LIMITS["pro"] == 10_000
        assert TIER_LIMITS["enterprise"] == 1_000_000


class TestTokenBucketRateLimiter:
    """Test token bucket rate limiting."""

    def test_allow_within_limit(self):
        limiter = TokenBucketRateLimiter(rate=100, capacity=100)
        assert limiter.allow("test") is True

    def test_burst_capacity(self):
        limiter = TokenBucketRateLimiter(rate=1, capacity=5)
        # Should allow burst of 5
        for _ in range(5):
            assert limiter.allow("test") is True
        # 6th should fail
        assert limiter.allow("test") is False

    def test_retry_after(self):
        limiter = TokenBucketRateLimiter(rate=1, capacity=1)
        limiter.allow("test")  # Use the token
        retry = limiter.get_retry_after("test")
        assert retry >= 0.0

    def test_different_keys(self):
        limiter = TokenBucketRateLimiter(rate=1, capacity=1)
        assert limiter.allow("key1") is True
        assert limiter.allow("key2") is True  # Different key, separate bucket


class TestPerIPRateLimiter:
    """Test per-IP rate limiting."""

    def test_allow(self):
        limiter = PerIPRateLimiter(requests_per_minute=60)
        assert limiter.allow("127.0.0.1") is True

    def test_exceed_limit(self):
        limiter = PerIPRateLimiter(requests_per_minute=3)
        # The requests inside the budget must pass; otherwise the final
        # rejection would prove nothing about where the limit sits.
        for _ in range(3):
            assert limiter.allow("1.2.3.4") is True
        assert limiter.allow("1.2.3.4") is False

    def test_different_ips(self):
        limiter = PerIPRateLimiter(requests_per_minute=1)
        assert limiter.allow("1.1.1.1") is True
        assert limiter.allow("2.2.2.2") is True  # Different IP


class TestBillingTracker:
    """Test billing/usage tracking."""

    def test_record_usage(self):
        tracker = BillingTracker()
        tracker.record_usage("key1", "task1", "mathematics")
        assert tracker.get_usage_count("key1") == 1

    def test_usage_summary(self):
        tracker = BillingTracker()
        tracker.record_usage("key1", "t1", "mathematics")
        tracker.record_usage("key1", "t2", "code")
        tracker.record_usage("key1", "t3", "mathematics")

        summary = tracker.get_usage_summary("key1")
        assert summary["total_requests"] == 3
        assert summary["by_domain"]["mathematics"] == 2
        assert summary["by_domain"]["code"] == 1


class TestSchemas:
    """Test Pydantic schemas."""

    def test_task_submission_valid(self):
        req = TaskSubmissionRequest(
            problem="This is a valid problem statement for testing.",
            domain="mathematics",
            difficulty=5,
        )
        assert req.problem.startswith("This is")

    def test_task_submission_min_length(self):
        # Pydantic's ValidationError derives from ValueError. Expecting that,
        # rather than any Exception, keeps an unrelated failure (e.g. a
        # NameError inside the schema module) from passing as a rejection.
        with pytest.raises(ValueError):
            TaskSubmissionRequest(problem="short")

    def test_health_response(self):
        resp = HealthResponse(
            status="healthy",
            version="0.1.0",
            uptime_seconds=100.0,
            epoch=5,
            db_connected=True,
        )
        assert resp.status == "healthy"
class TestFullEpochCycle:
    """Test a complete epoch cycle without blockchain."""

    @pytest.fixture
    def task_manager(self):
        return TaskManager(benchmark_dir="benchmarks", seed=42)

    @pytest.fixture
    def scorer(self):
        return ValidatorScorer(lean4_enabled=False, sandbox_enabled=False)

    @pytest.fixture
    def weight_setter(self):
        return WeightSetter()

    def test_task_generation(self, task_manager):
        """Tasks can be generated."""
        tasks = task_manager.generate_epoch_tasks(count=12, trap_rate=0.15)
        assert len(tasks) == 12
        # At least 1 trap
        traps = [t for t in tasks if t.is_trap]
        assert len(traps) >= 1

    def test_synapse_roundtrip(self):
        """Synapse can be created, filled, and deserialized."""
        synapse = create_reasoning_task(
            problem="What is 2+2?",
            domain="mathematics",
            difficulty=2,
        )
        assert synapse.task_id

        # Simulate miner filling in response
        synapse.reasoning_steps = [{"step_id": 0, "reasoning": "2+2=4", "confidence": 0.99}]
        synapse.final_answer = "4"
        synapse.time_taken_ms = 100
        synapse.submission_hash = synapse.compute_submission_hash()

        # Verify hash
        assert verify_submission_hash(synapse) is True

        # Deserialize
        data = synapse.deserialize()
        assert data["final_answer"] == "4"
        assert len(data["steps"]) == 1

    @pytest.mark.asyncio
    async def test_scoring_pipeline(self, scorer):
        """Scorer produces valid dimension scores."""
        task = Task(
            task_id="int-001",
            problem="Prove sqrt(2) is irrational",
            domain=Domain.MATHEMATICS,
            difficulty=5,
        )
        response = {
            "steps": [
                {
                    "step_id": 0,
                    "reasoning": "Assume for contradiction that sqrt(2) = p/q in lowest terms.",
                    "evidence": "Proof by contradiction",
                    "confidence": 0.9,
                },
                {
                    "step_id": 1,
                    "reasoning": "Then p^2 = 2q^2, so p is even. Let p=2k, then q^2=2k^2, so q is even too.",
                    "evidence": "Algebraic manipulation",
                    "confidence": 0.95,
                },
            ],
            "final_answer": "sqrt(2) is irrational",
            "time_taken_ms": 5000,
        }

        dims = await scorer.compute_dimensions(task, response)
        assert isinstance(dims, DimensionScores)
        assert dims.quality > 0
        assert dims.efficiency > 0

        # CMS should be valid
        cms = ScoringEngine.compute_cms(dims)
        assert 0.0 <= cms <= 1.0

    def test_weight_computation(self, weight_setter):
        """Weights are computed and normalized correctly."""
        miner_states = {
            0: {"s_epoch": 0.9, "peb": 0.15},
            1: {"s_epoch": 0.7, "peb": 0.0},
            2: {"s_epoch": 0.5, "peb": 0.05},
            3: {"s_epoch": 0.3, "peb": 0.0},
        }
        uids, weights = weight_setter.compute_weights(miner_states, n=10)

        assert len(uids) == 4
        total = sum(weights)
        assert abs(total - 1.0) < 1e-6

        # Best miner should get highest weight
        weights_list = list(weights)
        assert weights_list[0] > weights_list[1] > weights_list[2] > weights_list[3]

    def test_trap_detection(self):
        """Trap problems correctly identify good and bad answers."""
        tm = TrapManager()
        trap = Task(
            task_id="trap-001",
            problem="What is 2+2?",
            domain=Domain.MATHEMATICS,
            difficulty=2,
            is_trap=True,
            ground_truth_score=1.0,
        )

        # Good answer
        good_score = tm.evaluate_trap_response(trap, "The answer is 4")
        assert good_score == 1.0

        # Bad answer
        bad_score = tm.evaluate_trap_response(trap, "The answer is 5")
        assert bad_score == 0.0

        # Record and check penalty
        tm.record_trap_score(1, good_score)
        tm.record_trap_score(2, bad_score)

        assert tm.get_trap_penalty(1) == 1.0
        assert tm.get_trap_penalty(2) < 1.0

    def test_input_sanitization(self):
        """Oversized inputs are sanitized."""
        # One dict per step: `[step] * 60` would repeat a single shared object,
        # so clamping it once would make every step look sanitized.
        oversized_steps = [{"reasoning": "x" * 20_000, "confidence": 5.0} for _ in range(60)]
        synapse = ReasoningTask(
            task_id="test",
            problem="test",
            domain="mathematics",
            difficulty=5,
            reasoning_steps=oversized_steps,
            final_answer="x" * 100_000,
        )

        InputSanitizer.sanitize_submission(synapse)

        assert len(synapse.reasoning_steps) <= 50
        assert len(synapse.reasoning_steps[0]["reasoning"]) <= 10_000
        assert synapse.reasoning_steps[0]["confidence"] == 1.0
        # Every surviving step must be clamped, not only the first.
        assert all(len(step["reasoning"]) <= 10_000 for step in synapse.reasoning_steps)
        assert all(step["confidence"] == 1.0 for step in synapse.reasoning_steps)
        assert len(synapse.final_answer) <= 50_000

    def test_emission_conservation(self):
        """Emission conservation: all emitted TAO is distributed."""
        total_emission = 100.0
        miners = [
            MinerState(miner_id="m1", s_epoch=0.9, peb=0.15, rank=1, streak=5),
            MinerState(miner_id="m2", s_epoch=0.7, peb=0.0, rank=2, streak=0),
            MinerState(miner_id="m3", s_epoch=0.5, peb=0.05, rank=3, streak=2),
        ]

        miner_pool = total_emission * 0.9
        rewards = ScoringEngine.distribute_miner_emissions(miners, miner_pool)

        assert abs(sum(rewards) - miner_pool) < 1e-6

    def test_multi_domain_task_generation(self, task_manager):
        """Tasks span multiple domains."""
        tasks = task_manager.generate_epoch_tasks(count=24)
        domains = {t.domain for t in tasks}
        # Should have tasks from multiple domains
        assert len(domains) >= 2


class TestCrashRecovery:
    """Test state persistence and recovery."""

    def test_save_and_restore_state(self, temp_db_path):
        from reasonforge.state.database import StateDatabase

        db = StateDatabase(temp_db_path)
        # Close the database even when an assertion fails, so the handle does
        # not outlive the test and block removal of the temporary file.
        try:
            # Save state
            state = {
                "epoch_id": 10,
                "miner_states": {
                    "0": {"s_epoch": 0.9, "peb": 0.1},
                    "1": {"s_epoch": 0.7, "peb": 0.0},
                },
            }
            db.save_checkpoint(state, epoch_id=10)

            # Restore
            loaded = db.load_latest_checkpoint()
            assert loaded is not None
            assert loaded["epoch_id"] == 10
            assert loaded["miner_states"]["0"]["s_epoch"] == 0.9
        finally:
            db.close()
class TestReasoningTaskSynapse:
    """Construction, deserialization and hashing of the ReasoningTask synapse."""

    @staticmethod
    def _single_step_task():
        """Return a fresh task with one reasoning step and a final answer."""
        return ReasoningTask(
            task_id="t001",
            reasoning_steps=[{"step_id": 0, "reasoning": "step1"}],
            final_answer="answer",
        )

    def test_create_empty(self):
        blank = ReasoningTask()
        assert blank.task_id == ""
        assert blank.reasoning_steps is None
        assert blank.final_answer is None

    def test_create_with_fields(self):
        populated = ReasoningTask(
            task_id="t001",
            problem="Solve x+1=2",
            domain="mathematics",
            difficulty=3,
            timeout_seconds=120,
        )
        assert populated.task_id == "t001"
        assert populated.difficulty == 3

    def test_deserialize(self):
        payload = ReasoningTask(
            task_id="t001",
            problem="Test",
            domain="code",
            difficulty=5,
            final_answer="42",
            reasoning_steps=[{"step_id": 0, "reasoning": "think"}],
        ).deserialize()
        assert payload["task_id"] == "t001"
        assert payload["final_answer"] == "42"
        assert len(payload["steps"]) == 1

    def test_submission_hash(self):
        digest = self._single_step_task().compute_submission_hash()
        assert isinstance(digest, str)
        assert len(digest) == 64  # SHA-256 hex

    def test_submission_hash_consistency(self):
        # Two independently built but identical tasks must hash the same.
        first = self._single_step_task()
        second = self._single_step_task()
        assert first.compute_submission_hash() == second.compute_submission_hash()

    def test_submission_hash_changes_with_content(self):
        digests = [
            ReasoningTask(task_id="t001", final_answer=text).compute_submission_hash()
            for text in ("answer1", "answer2")
        ]
        assert digests[0] != digests[1]

    def test_difficulty_validation(self):
        # Valid range
        for level in (1, 10):
            task = ReasoningTask(difficulty=level)
            assert task.difficulty == level


class TestHealthCheckSynapse:
    """Construction and deserialization of the HealthCheck synapse."""

    def test_create_empty(self):
        probe = HealthCheck()
        assert probe.status is None

    def test_deserialize(self):
        payload = HealthCheck(
            status="ready",
            supported_domains=["mathematics", "code"],
            version="0.1.0",
        ).deserialize()
        assert payload["status"] == "ready"
        assert len(payload["supported_domains"]) == 2


class TestTaskResultSynapse:
    """Construction and deserialization of the TaskResult synapse."""

    def test_create(self):
        result = TaskResult(epoch_id=5, miner_uid=42, s_epoch=0.85)
        assert result.epoch_id == 5
        assert result.miner_uid == 42

    def test_deserialize(self):
        payload = TaskResult(
            epoch_id=1,
            miner_uid=3,
            scores=[{"task_id": "t1", "cms": 0.8}],
            s_epoch=0.75,
            rank=2,
        ).deserialize()
        assert payload["epoch_id"] == 1
        assert payload["rank"] == 2


class TestVerifySubmissionHash:
    """verify_submission_hash accepts only a matching, present hash."""

    def test_valid_hash(self):
        signed = ReasoningTask(
            task_id="t001",
            reasoning_steps=[{"step_id": 0, "reasoning": "test"}],
            final_answer="42",
        )
        signed.submission_hash = signed.compute_submission_hash()
        assert verify_submission_hash(signed) is True

    def test_invalid_hash(self):
        forged = ReasoningTask(
            task_id="t001",
            final_answer="42",
            submission_hash="invalid_hash",
        )
        assert verify_submission_hash(forged) is False

    def test_no_hash(self):
        unsigned = ReasoningTask(task_id="t001")
        assert verify_submission_hash(unsigned) is False


class TestCreateReasoningTask:
    """The create_reasoning_task factory."""

    def test_factory(self):
        built = create_reasoning_task(
            problem="Test problem",
            domain="code",
            difficulty=7,
        )
        assert built.problem == "Test problem"
        assert built.domain == "code"
        assert built.difficulty == 7
        assert len(built.task_id) > 0

    def test_factory_default_id(self):
        # Each call without an explicit id must produce a distinct task_id.
        first = create_reasoning_task()
        second = create_reasoning_task()
        assert first.task_id != second.task_id
" * 5, + "evidence": "Both even contradicts gcd(p,q)=1", + "confidence": 0.95, + }, + ], + "final_answer": "sqrt(2) is irrational by proof by contradiction.", + "time_taken_ms": 5000, + } + + @pytest.fixture + def empty_response(self): + return { + "steps": [], + "final_answer": "", + "time_taken_ms": 100, + } + + def test_quality_good_response(self, scorer, math_task, good_response): + quality = scorer._score_quality(math_task, good_response) + assert 0.5 < quality <= 1.0 + + def test_quality_empty_response(self, scorer, math_task, empty_response): + quality = scorer._score_quality(math_task, empty_response) + assert quality == 0.0 + + def test_novelty_good_response(self, scorer, math_task, good_response): + novelty = scorer._score_novelty(math_task, good_response) + assert 0.0 <= novelty <= 1.0 + assert novelty > 0.0 + + def test_novelty_empty_response(self, scorer, math_task, empty_response): + novelty = scorer._score_novelty(math_task, empty_response) + assert novelty == 0.0 + + def test_efficiency_normal_time(self, scorer, math_task, good_response): + efficiency = scorer._score_efficiency(math_task, good_response) + assert 0.5 < efficiency <= 1.0 + + def test_efficiency_timeout(self, scorer, math_task): + response = {"time_taken_ms": 400_000} # > 300s timeout + efficiency = scorer._score_efficiency(math_task, response) + assert efficiency == 0.0 + + def test_efficiency_suspiciously_fast(self, scorer, math_task): + response = {"time_taken_ms": 10} # < 1% of timeout + efficiency = scorer._score_efficiency(math_task, response) + assert efficiency == 0.2 + + @pytest.mark.asyncio + async def test_compute_dimensions(self, scorer, math_task, good_response): + dims = await scorer.compute_dimensions(math_task, good_response) + assert isinstance(dims, DimensionScores) + assert 0.0 <= dims.quality <= 1.0 + assert 0.0 <= dims.accuracy <= 1.0 + assert 0.0 <= dims.novelty <= 1.0 + assert 0.0 <= dims.efficiency <= 1.0 + + +class TestTrapManager: + """Test trap problem 
management.""" + + def test_evaluate_math_trap_correct(self, sample_trap_task): + tm = TrapManager() + score = tm.evaluate_trap_response(sample_trap_task, "The answer is 4") + assert score == 1.0 + + def test_evaluate_math_trap_wrong(self, sample_trap_task): + tm = TrapManager() + score = tm.evaluate_trap_response(sample_trap_task, "The answer is 5") + assert score == 0.0 + + def test_evaluate_empty_answer(self, sample_trap_task): + tm = TrapManager() + score = tm.evaluate_trap_response(sample_trap_task, "") + assert score == 0.0 + + def test_record_and_retrieve(self): + tm = TrapManager() + tm.record_trap_score(1, 0.8) + tm.record_trap_score(1, 0.9) + scores = tm.get_trap_scores(1) + assert len(scores) == 2 + assert scores[0] == 0.8 + + def test_trap_penalty(self): + tm = TrapManager() + tm.record_trap_score(1, 0.9) + tm.record_trap_score(1, 0.8) + penalty = tm.get_trap_penalty(1) + assert penalty == 1.0 # Above threshold + + def test_trap_penalty_low_scores(self): + tm = TrapManager() + tm.record_trap_score(1, 0.1) + tm.record_trap_score(1, 0.2) + penalty = tm.get_trap_penalty(1) + assert penalty < 1.0 # Below threshold + + +class TestWeightSetter: + """Test weight computation.""" + + def test_compute_weights_basic(self): + ws = WeightSetter() + states = { + 0: {"s_epoch": 0.8, "peb": 0.1}, + 1: {"s_epoch": 0.6, "peb": 0.0}, + 2: {"s_epoch": 0.4, "peb": 0.05}, + } + uids, weights = ws.compute_weights(states, n=10) + assert len(uids) == 3 + assert len(weights) == 3 + # Weights should be normalized + total = sum(weights) if not hasattr(weights, "sum") else float(weights.sum()) + assert abs(total - 1.0) < 1e-6 + + def test_compute_weights_empty(self): + ws = WeightSetter() + uids, weights = ws.compute_weights({}, n=10) + assert len(uids) == 0 + + def test_compute_weights_all_zero(self): + ws = WeightSetter() + states = { + 0: {"s_epoch": 0.0, "peb": 0.0}, + 1: {"s_epoch": 0.0, "peb": 0.0}, + } + uids, weights = ws.compute_weights(states, n=10) + assert len(uids) == 
0 + + def test_higher_score_higher_weight(self): + ws = WeightSetter() + states = { + 0: {"s_epoch": 0.9, "peb": 0.2}, + 1: {"s_epoch": 0.3, "peb": 0.0}, + } + uids, weights = ws.compute_weights(states, n=10) + weights_list = list(weights) if not hasattr(weights, "tolist") else weights.tolist() + assert weights_list[0] > weights_list[1] + + +class TestConsensus: + """Test consensus scoring.""" + + def test_basic_consensus(self): + scores = [(0.8, 100.0), (0.7, 100.0), (0.9, 100.0)] + result = compute_consensus_score(scores) + assert 0.0 <= result <= 1.0 + + def test_empty_consensus(self): + assert compute_consensus_score([]) == 0.0 + + def test_consensus_with_different_stakes(self): + scores = [(0.9, 1000.0), (0.5, 10.0)] + result = compute_consensus_score(scores) + # High-stake validator should dominate + assert result > 0.6 diff --git a/tests/test_security.py b/tests/test_security.py new file mode 100644 index 0000000..0d45ccf --- /dev/null +++ b/tests/test_security.py @@ -0,0 +1,187 @@ +""" +ReasonForge - Security Tests + +Tests for input sanitization, rate guarding, and anomaly detection. 
+""" + +import base64 + +import pytest + +from reasonforge.security.anomaly import AnomalyDetector +from reasonforge.security.rate_guard import RateGuard +from reasonforge.security.sanitizer import InputSanitizer + + +class TestInputSanitizer: + """Test input sanitization.""" + + def test_sanitize_oversized_steps(self): + class FakeResponse: + reasoning_steps = [{"reasoning": "x" * 20_000, "confidence": 0.5}] * 60 + final_answer = "answer" + proof_artifact = None + code_artifact = None + + resp = FakeResponse() + InputSanitizer.sanitize_submission(resp) + assert len(resp.reasoning_steps) <= 50 + assert len(resp.reasoning_steps[0]["reasoning"]) <= 10_000 + + def test_sanitize_confidence_range(self): + class FakeResponse: + reasoning_steps = [ + {"reasoning": "test", "confidence": 5.0}, + {"reasoning": "test", "confidence": -1.0}, + ] + final_answer = "answer" + proof_artifact = None + code_artifact = None + + resp = FakeResponse() + InputSanitizer.sanitize_submission(resp) + assert resp.reasoning_steps[0]["confidence"] == 1.0 + assert resp.reasoning_steps[1]["confidence"] == 0.0 + + def test_sanitize_oversized_answer(self): + class FakeResponse: + reasoning_steps = [] + final_answer = "x" * 100_000 + proof_artifact = None + code_artifact = None + + resp = FakeResponse() + InputSanitizer.sanitize_submission(resp) + assert len(resp.final_answer) <= 50_000 + + def test_sanitize_oversized_proof(self): + big_proof = base64.b64encode(b"x" * 2_000_000).decode() + + class FakeResponse: + reasoning_steps = [] + final_answer = "answer" + proof_artifact = big_proof + code_artifact = None + + resp = FakeResponse() + InputSanitizer.sanitize_submission(resp) + assert resp.proof_artifact is None + + def test_sanitize_problem_xss(self): + problem = 'Real problem here.' + sanitized = InputSanitizer.sanitize_problem(problem) + assert "