From 83f7d33a803e0fb229d286fe6d377e380f994809 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 3 Apr 2026 18:32:04 -0400 Subject: [PATCH] Revert "Add comment showing change in benchmark file sizes (#7264)" This reverts commit 16bbd12a908511db1f0aac9ea42dd330d25ba2b9. Signed-off-by: Connor Tsui --- .github/workflows/sql-benchmarks.yml | 62 +------- scripts/capture-file-sizes.py | 97 ------------ scripts/compare-file-sizes.py | 145 ------------------ .../src/statpopgen/statpopgen_benchmark.rs | 2 +- 4 files changed, 2 insertions(+), 304 deletions(-) delete mode 100644 scripts/capture-file-sizes.py delete mode 100644 scripts/compare-file-sizes.py diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index f514fa0b7ab..08afe175341 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -223,6 +223,7 @@ jobs: ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }} - name: Install uv + if: inputs.mode == 'pr' uses: spiraldb/actions/.github/actions/setup-uv@0.18.5 with: sync: false @@ -259,56 +260,6 @@ jobs: # unique benchmark configuration must have a unique comment-tag. comment-tag: bench-pr-comment-${{ matrix.id }} - - name: Compare file sizes - if: inputs.mode == 'pr' && matrix.remote_storage == null - shell: bash - run: | - set -Eeu -o pipefail -x - - # Capture HEAD file sizes (vortex formats only) - uv run --no-project scripts/capture-file-sizes.py \ - vortex-bench/data \ - --benchmark ${{ matrix.subcommand }} \ - --commit ${{ github.event.pull_request.head.sha }} \ - -o head-sizes.json - - # Get base commit SHA (same as benchmark comparison) - base_commit_sha=$(\ - curl -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ - https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \ - | jq -r '.workflow_runs[].head_sha' \ - ) - - # Download file sizes baseline (per-benchmark file) - python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes-${{ matrix.id }}.json.gz file-sizes.json.gz --no-sign-request || true - - # Generate comparison report - echo '# File Sizes: ${{ matrix.name }}' > sizes-comment.md - echo '' >> sizes-comment.md - - if [ -f file-sizes.json.gz ]; then - gzip -d -c file-sizes.json.gz | grep $base_commit_sha > base-sizes.json || true - if [ -s base-sizes.json ]; then - uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \ - >> sizes-comment.md - else - echo '_No baseline file sizes found for base commit._' >> sizes-comment.md - fi - else - echo '_No baseline file sizes available yet._' >> sizes-comment.md - fi - - cat sizes-comment.md >> $GITHUB_STEP_SUMMARY - - - name: Comment PR with file sizes - if: inputs.mode == 'pr' && matrix.remote_storage == null && github.event.pull_request.head.repo.fork == false - uses: thollander/actions-comment-pull-request@v3 - with: - file-path: sizes-comment.md - comment-tag: file-sizes-${{ matrix.id }} - - name: Comment PR on failure if: failure() && inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false uses: thollander/actions-comment-pull-request@v3 @@ -325,17 +276,6 @@ jobs: run: | bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json - - name: Upload File Sizes - if: inputs.mode == 'develop' && matrix.remote_storage == null - shell: bash - run: | - uv run --no-project scripts/capture-file-sizes.py \ - vortex-bench/data \ - --benchmark ${{ matrix.subcommand }} \ - --commit ${{ github.sha }} \ - -o sizes.json - bash scripts/cat-s3.sh vortex-ci-benchmark-results file-sizes-${{ matrix.id }}.json.gz sizes.json - - name: Alert incident.io if: failure() && inputs.mode == 'develop' uses: ./.github/actions/alert-incident-io diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py deleted file mode 100644 index 754df1ee702..00000000000 --- a/scripts/capture-file-sizes.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.11" -# dependencies = [] -# /// - -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -"""Capture file sizes from benchmark data directories and output as JSONL.""" - -import argparse -import json -import sys -from pathlib import Path - - -def main(): - parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories") - parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)") - parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)") - parser.add_argument("--commit", required=True, help="Commit SHA") - parser.add_argument("-o", "--output", required=True, help="Output JSONL file path") - args = parser.parse_args() - - data_dir = Path(args.data_dir) - if not data_dir.exists(): - print(f"Data directory not found: {data_dir}", file=sys.stderr) - sys.exit(1) - - # Find benchmark directories matching the name (handles flavors like clickbench_partitioned) - # Also handles exact match (e.g., tpch) - benchmark_dirs = [ - d - for d in data_dir.iterdir() - if d.is_dir() and (d.name == args.benchmark or d.name.startswith(f"{args.benchmark}_")) - ] - - if not benchmark_dirs: - print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr) - sys.exit(1) - - # Formats to capture (vortex formats only, not parquet/duckdb) - # Note: "vortex" CLI arg maps to "vortex-file-compressed" directory name - formats_to_capture = {"vortex-file-compressed", "vortex-compact"} - - records = [] - - # Walk subdirectories looking for format directories - # Handle both direct format dirs (clickbench_partitioned/vortex-file-compressed/) - # and scale factor subdirs (tpch/1.0/vortex-file-compressed/) - for benchmark_dir in benchmark_dirs: - for format_dir in benchmark_dir.rglob("*"): - if not format_dir.is_dir(): - continue - - format_name = format_dir.name - if format_name not in formats_to_capture: - continue - - # Extract scale factor from path (e.g., "1.0" for tpch/1.0/vortex-file-compressed) - # Default to "1.0" if no intermediate directory (e.g., clickbench) - path_between = format_dir.relative_to(benchmark_dir).parent - scale_factor = str(path_between) if str(path_between) != "." else "1.0" - - # Capture all files in this format directory - for file_path in format_dir.rglob("*"): - if not file_path.is_file(): - continue - - size_bytes = file_path.stat().st_size - relative_path = file_path.relative_to(format_dir) - - records.append( - { - "commit_id": args.commit, - "benchmark": args.benchmark, - "scale_factor": scale_factor, - "format": format_name, - "file": str(relative_path), - "size_bytes": size_bytes, - } - ) - - # Sort for deterministic output - records.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"])) - - # Write JSONL output - with open(args.output, "w") as f: - for record in records: - f.write(json.dumps(record) + "\n") - - print(f"Captured {len(records)} file sizes to {args.output}", file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py deleted file mode 100644 index f840576c097..00000000000 --- a/scripts/compare-file-sizes.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.11" -# dependencies = [] -# /// - -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -"""Compare file sizes between base and HEAD and generate markdown report.""" - -import argparse -import json -import sys -from collections import defaultdict - - -def format_size(size_bytes: int) -> str: - """Format bytes as human-readable size.""" - if size_bytes >= 1024**3: - return f"{size_bytes / (1024**3):.2f} GB" - elif size_bytes >= 1024**2: - return f"{size_bytes / (1024**2):.2f} MB" - elif size_bytes >= 1024: - return f"{size_bytes / 1024:.2f} KB" - else: - return f"{size_bytes} B" - - -def format_change(change_bytes: int) -> str: - """Format byte change with sign.""" - sign = "+" if change_bytes > 0 else "" - return f"{sign}{format_size(abs(change_bytes))}" - - -def format_pct_change(pct: float) -> str: - """Format percentage change with sign.""" - sign = "+" if pct > 0 else "" - return f"{sign}{pct:.1f}%" - - -def main(): - parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD") - parser.add_argument("base_file", help="Base JSONL file") - parser.add_argument("head_file", help="HEAD JSONL file") - args = parser.parse_args() - - # Load base and head data - base_data = {} - try: - with open(args.base_file) as f: - for line in f: - record = json.loads(line) - # Support old records without scale_factor (default to "1.0") - scale_factor = record.get("scale_factor", "1.0") - key = (record["benchmark"], scale_factor, record["format"], record["file"]) - base_data[key] = record["size_bytes"] - except FileNotFoundError: - print("_Base file sizes not found._") - sys.exit(0) - - head_data = {} - try: - with open(args.head_file) as f: - for line in f: - record = json.loads(line) - scale_factor = record.get("scale_factor", "1.0") - key = (record["benchmark"], scale_factor, record["format"], record["file"]) - head_data[key] = record["size_bytes"] - except FileNotFoundError: - print("_HEAD file sizes not found._") - sys.exit(0) - - # Compare sizes - comparisons = [] - format_totals = defaultdict(lambda: {"base": 0, "head": 0}) - - all_keys = set(base_data.keys()) | set(head_data.keys()) - for key in all_keys: - benchmark, scale_factor, fmt, file_name = key - base_size = base_data.get(key, 0) - head_size = head_data.get(key, 0) - - format_totals[fmt]["base"] += base_size - format_totals[fmt]["head"] += head_size - - change = head_size - base_size - if change == 0: - continue - - if base_size > 0: - pct_change = (head_size / base_size - 1) * 100 - elif head_size > 0: - pct_change = float("inf") - else: - pct_change = 0 - - comparisons.append( - { - "file": file_name, - "scale_factor": scale_factor, - "format": fmt, - "base_size": base_size, - "head_size": head_size, - "change": change, - "pct_change": pct_change, - } - ) - - if not comparisons: - print("_No file size changes detected._") - return - - # Sort by pct_change descending (largest increases first) - comparisons.sort(key=lambda x: x["pct_change"], reverse=True) - - # Output markdown table - print("| File | Scale | Format | Base | HEAD | Change | % |") - print("|------|-------|--------|------|------|--------|---|") - - for comp in comparisons: - pct_str = format_pct_change(comp["pct_change"]) if comp["pct_change"] != float("inf") else "new" - base_str = format_size(comp["base_size"]) if comp["base_size"] > 0 else "-" - print( - f"| {comp['file']} | {comp['scale_factor']} | {comp['format']} | {base_str} | " - f"{format_size(comp['head_size'])} | {format_change(comp['change'])} | {pct_str} |" - ) - - # Output totals - print("") - print("**Totals:**") - for fmt in sorted(format_totals.keys()): - totals = format_totals[fmt] - base_total = totals["base"] - head_total = totals["head"] - if base_total > 0: - total_pct = (head_total / base_total - 1) * 100 - pct_str = f" ({format_pct_change(total_pct)})" - else: - pct_str = "" - print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{pct_str}") - - -if __name__ == "__main__": - main() diff --git a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs index dc5fa6448ad..941859dc90b 100644 --- a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs +++ b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs @@ -52,7 +52,7 @@ impl StatPopGenBenchmark { ) })?; - let data_path = "statpopgen".to_data_path().join(format!("{n_rows}/")); + let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/")); let data_url = Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;