vectorcode/.github/workflows/benchmark.yml at main · alejandro-technology/vectorcode · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Manually dispatched benchmark workflow. The display name below carries a
# status note; its Spanish part reads "it takes too long to run, more than
# 2 hours".
name: 'Benchmark (Not run for now. Se demora demasiado en ejecutar mas de 2 horas)'

# Only a manual trigger is configured; no push or pull_request event starts
# this workflow.
on:
  workflow_dispatch:

# Environment variables shared by every step of every job.
env:
  # Force coloured Cargo output even though the log is not a terminal.
  CARGO_TERM_COLOR: 'always'
  # Print a backtrace when a Rust program panics.
  RUST_BACKTRACE: 1
  # Presumably selects the embedding provider used by the vectorcode CLI;
  # verify against the CLI's configuration.
  VECTORCODE_PROVIDER: 'onnx'

jobs:
  # Single job: builds the release binary, runs the benchmarks, checks them
  # against the committed baselines and publishes the results.
  benchmark:
    name: Benchmark (mini-corpus)
    runs-on: macos-14  # ARM runner for ONNX performance
    # Least-privilege GITHUB_TOKEN instead of the repository default: read
    # access for checkout, plus write access to pull requests, which the
    # "Comment PR with results" step needs in order to create its comment.
    permissions:
      contents: read
      pull-requests: write
    steps:
      # Fetch the repository contents into the runner workspace.
      - uses: 'actions/checkout@v4'

      # Install the stable Rust toolchain.
      - name: 'Install Rust toolchain'
        uses: 'dtolnay/rust-toolchain@stable'

      # Reuse Cargo dependencies and build artifacts between runs.
      - name: 'Cache cargo registry and build'
        uses: 'Swatinem/rust-cache@v2'

      # Persist the model directory across runs. The exact key is tried first;
      # any cache whose key starts with "onnx-model-" is the fallback.
      - name: 'Cache ONNX model'
        uses: 'actions/cache@v4'
        with:
          path: '~/.vectorcode/models'
          key: 'onnx-model-all-MiniLM-L6-v2'
          restore-keys: |
            onnx-model-

      # Informational only: this step downloads nothing itself, it just logs
      # a notice when the model directory is absent.
      - name: 'Download ONNX model (if not cached)'
        run: |
          [ -d "$HOME/.vectorcode/models" ] || echo "ONNX model not cached, will download on first run"

      # Compile in release mode up front; presumably this lets the later
      # `cargo run --release` steps reuse the finished build.
      - name: 'Build release binary'
        run: |-
          cargo build --release

      # Run the `benchmark` subcommand on the mini corpus with table output.
      - name: 'Run mini-corpus benchmark'
        run: |
          cargo run --release -- benchmark --corpus mini --output table

      # Run the `bench-store` subcommand on the vscode corpus.
      - name: 'Validate Indexing Footprint (vscode corpus)'
        run: |
          cargo run --release -- bench-store --corpus vscode

      # Phase 4.1 regression gate (mock-mini). Per the original note,
      # scripts/verify-baseline.sh runs three --compare flows (IR, structural,
      # store) and exits non-zero on a regression, which fails this job.
      # NOTE(review): the workflow is currently triggered only by
      # workflow_dispatch, so a failure here blocks a PR only if the job is
      # also wired up as a required check - confirm.
      - name: 'Verify against committed baselines'
        run: |
          bash scripts/verify-baseline.sh

      # Publish the result files as a build artifact. `always()` keeps this
      # step running even when an earlier step has failed.
      - name: 'Upload benchmark results'
        if: always()
        uses: 'actions/upload-artifact@v4'
        with:
          name: 'benchmark-results'
          # One path or glob per line.
          path: |
            benchmark-*.json
            benchmarks/baseline/delta-report.json
            BASELINE.md
          # Number of days the artifact is kept.
          retention-days: 30

      # Post a summary of benchmark-mini.json as a pull request comment.
      # Runs only for pull_request events; always() keeps it running when an
      # earlier step failed, so a results (or failure) comment is still posted.
      # NOTE(review): the workflow is currently triggered only by
      # workflow_dispatch, so this step is skipped until a pull_request
      # trigger is added.
      # NOTE(review): assumes the benchmark run writes benchmark-mini.json to
      # the workspace root - confirm against the CLI.
      - name: Comment PR with results
        if: github.event_name == 'pull_request' && always()
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            let body = '## Benchmark Results (mini-corpus)\n\n';

            // Keep free text from breaking a Markdown table row: collapse
            // whitespace (including line breaks) and escape the column
            // separator.
            const cell = (text) =>
              String(text).replace(/\s+/g, ' ').replace(/\|/g, '\\|');

            try {
              const results = fs.readFileSync('benchmark-mini.json', 'utf8');
              const data = JSON.parse(results);

              // Build the report separately so that a failure part-way
              // through never leaves a half-written table in the comment.
              let report = '';

              report += `**Corpus**: ${data.corpus}\n`;
              report += `**Files indexed**: ${data.files_indexed}\n`;
              report += `**Queries executed**: ${data.queries_executed}\n`;
              report += `**Duration**: ${data.duration_secs.toFixed(2)}s\n\n`;

              report += '### Aggregate Metrics\n\n';
              report += '| Metric | Value |\n';
              report += '|--------|-------|\n';
              report += `| Recall@5 | ${data.aggregate.recall_at_5.toFixed(4)} |\n`;
              report += `| Recall@10 | ${data.aggregate.recall_at_10.toFixed(4)} |\n`;
              report += `| nDCG@10 | ${data.aggregate.ndcg_at_10.toFixed(4)} |\n`;
              report += `| MRR | ${data.aggregate.mrr.toFixed(4)} |\n`;

              report += '\n<details><summary>Per-query results</summary>\n\n';
              report += '| Query | R@5 | R@10 | nDCG | MRR |\n';
              report += '|-------|-----|------|------|-----|\n';

              // Only the first ten queries are listed; the rest are counted.
              for (const qr of data.query_results.slice(0, 10)) {
                const query = cell(qr.query.substring(0, 40));
                report += `| ${query} | ${qr.recall_at_5.toFixed(2)} | ${qr.recall_at_10.toFixed(2)} | ${qr.ndcg_at_10.toFixed(2)} | ${qr.mrr.toFixed(2)} |\n`;
              }

              if (data.query_results.length > 10) {
                report += `| ... and ${data.query_results.length - 10} more | | | | |\n`;
              }

              report += '\n</details>\n';
              body += report;
            } catch (err) {
              // Record the reason in the job log instead of dropping it; the
              // comment itself still carries the short notice.
              core.warning(`Could not parse benchmark results: ${err.message}`);
              body += '⚠️ Could not parse benchmark results\n';
            }

            // Await the request so that an API failure fails this step
            // instead of surfacing as an unhandled promise rejection.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });