From 83f7d33a803e0fb229d286fe6d377e380f994809 Mon Sep 17 00:00:00 2001
From: Connor Tsui <connor.tsui20@gmail.com>
Date: Fri, 3 Apr 2026 18:32:04 -0400
Subject: [PATCH] Revert "Add comment showing change in benchmark file sizes
 (#7264)"

This reverts commit 16bbd12a908511db1f0aac9ea42dd330d25ba2b9.

Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
---
 .github/workflows/sql-benchmarks.yml          |  62 +-------
 scripts/capture-file-sizes.py                 |  97 ------------
 scripts/compare-file-sizes.py                 | 145 ------------------
 .../src/statpopgen/statpopgen_benchmark.rs    |   2 +-
 4 files changed, 2 insertions(+), 304 deletions(-)
 delete mode 100644 scripts/capture-file-sizes.py
 delete mode 100644 scripts/compare-file-sizes.py

diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
index f514fa0b7ab..08afe175341 100644
--- a/.github/workflows/sql-benchmarks.yml
+++ b/.github/workflows/sql-benchmarks.yml
@@ -223,6 +223,7 @@ jobs:
             ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }}
 
       - name: Install uv
+        if: inputs.mode == 'pr'
         uses: spiraldb/actions/.github/actions/setup-uv@0.18.5
         with:
           sync: false
@@ -259,56 +260,6 @@ jobs:
           # unique benchmark configuration must have a unique comment-tag.
           comment-tag: bench-pr-comment-${{ matrix.id }}
 
-      - name: Compare file sizes
-        if: inputs.mode == 'pr' && matrix.remote_storage == null
-        shell: bash
-        run: |
-          set -Eeu -o pipefail -x
-
-          # Capture HEAD file sizes (vortex formats only)
-          uv run --no-project scripts/capture-file-sizes.py \
-            vortex-bench/data \
-            --benchmark ${{ matrix.subcommand }} \
-            --commit ${{ github.event.pull_request.head.sha }} \
-            -o head-sizes.json
-
-          # Get base commit SHA (same as benchmark comparison)
-          base_commit_sha=$(\
-            curl -L \
-              -H "Accept: application/vnd.github+json" \
-              -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-              https://api.github.com/repos/vortex-data/vortex/actions/workflows/bench.yml/runs\?branch\=develop\&status\=success\&per_page\=1 \
-            | jq -r '.workflow_runs[].head_sha' \
-          )
-
-          # Download file sizes baseline (per-benchmark file)
-          python3 scripts/s3-download.py s3://vortex-ci-benchmark-results/file-sizes-${{ matrix.id }}.json.gz file-sizes.json.gz --no-sign-request || true
-
-          # Generate comparison report
-          echo '# File Sizes: ${{ matrix.name }}' > sizes-comment.md
-          echo '' >> sizes-comment.md
-
-          if [ -f file-sizes.json.gz ]; then
-            gzip -d -c file-sizes.json.gz | grep $base_commit_sha > base-sizes.json || true
-            if [ -s base-sizes.json ]; then
-              uv run --no-project scripts/compare-file-sizes.py base-sizes.json head-sizes.json \
-                >> sizes-comment.md
-            else
-              echo '_No baseline file sizes found for base commit._' >> sizes-comment.md
-            fi
-          else
-            echo '_No baseline file sizes available yet._' >> sizes-comment.md
-          fi
-
-          cat sizes-comment.md >> $GITHUB_STEP_SUMMARY
-
-      - name: Comment PR with file sizes
-        if: inputs.mode == 'pr' && matrix.remote_storage == null && github.event.pull_request.head.repo.fork == false
-        uses: thollander/actions-comment-pull-request@v3
-        with:
-          file-path: sizes-comment.md
-          comment-tag: file-sizes-${{ matrix.id }}
-
       - name: Comment PR on failure
         if: failure() && inputs.mode == 'pr' && github.event.pull_request.head.repo.fork == false
         uses: thollander/actions-comment-pull-request@v3
@@ -325,17 +276,6 @@ jobs:
         run: |
           bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
 
-      - name: Upload File Sizes
-        if: inputs.mode == 'develop' && matrix.remote_storage == null
-        shell: bash
-        run: |
-          uv run --no-project scripts/capture-file-sizes.py \
-            vortex-bench/data \
-            --benchmark ${{ matrix.subcommand }} \
-            --commit ${{ github.sha }} \
-            -o sizes.json
-          bash scripts/cat-s3.sh vortex-ci-benchmark-results file-sizes-${{ matrix.id }}.json.gz sizes.json
-
       - name: Alert incident.io
         if: failure() && inputs.mode == 'develop'
         uses: ./.github/actions/alert-incident-io
diff --git a/scripts/capture-file-sizes.py b/scripts/capture-file-sizes.py
deleted file mode 100644
index 754df1ee702..00000000000
--- a/scripts/capture-file-sizes.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-# /// script
-# requires-python = ">=3.11"
-# dependencies = []
-# ///
-
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-"""Capture file sizes from benchmark data directories and output as JSONL."""
-
-import argparse
-import json
-import sys
-from pathlib import Path
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Capture file sizes from benchmark data directories")
-    parser.add_argument("data_dir", help="Data directory (e.g., vortex-bench/data)")
-    parser.add_argument("--benchmark", required=True, help="Benchmark name (e.g., clickbench)")
-    parser.add_argument("--commit", required=True, help="Commit SHA")
-    parser.add_argument("-o", "--output", required=True, help="Output JSONL file path")
-    args = parser.parse_args()
-
-    data_dir = Path(args.data_dir)
-    if not data_dir.exists():
-        print(f"Data directory not found: {data_dir}", file=sys.stderr)
-        sys.exit(1)
-
-    # Find benchmark directories matching the name (handles flavors like clickbench_partitioned)
-    # Also handles exact match (e.g., tpch)
-    benchmark_dirs = [
-        d
-        for d in data_dir.iterdir()
-        if d.is_dir() and (d.name == args.benchmark or d.name.startswith(f"{args.benchmark}_"))
-    ]
-
-    if not benchmark_dirs:
-        print(f"No benchmark directories found matching: {args.benchmark}", file=sys.stderr)
-        sys.exit(1)
-
-    # Formats to capture (vortex formats only, not parquet/duckdb)
-    # Note: "vortex" CLI arg maps to "vortex-file-compressed" directory name
-    formats_to_capture = {"vortex-file-compressed", "vortex-compact"}
-
-    records = []
-
-    # Walk subdirectories looking for format directories
-    # Handle both direct format dirs (clickbench_partitioned/vortex-file-compressed/)
-    # and scale factor subdirs (tpch/1.0/vortex-file-compressed/)
-    for benchmark_dir in benchmark_dirs:
-        for format_dir in benchmark_dir.rglob("*"):
-            if not format_dir.is_dir():
-                continue
-
-            format_name = format_dir.name
-            if format_name not in formats_to_capture:
-                continue
-
-            # Extract scale factor from path (e.g., "1.0" for tpch/1.0/vortex-file-compressed)
-            # Default to "1.0" if no intermediate directory (e.g., clickbench)
-            path_between = format_dir.relative_to(benchmark_dir).parent
-            scale_factor = str(path_between) if str(path_between) != "." else "1.0"
-
-            # Capture all files in this format directory
-            for file_path in format_dir.rglob("*"):
-                if not file_path.is_file():
-                    continue
-
-                size_bytes = file_path.stat().st_size
-                relative_path = file_path.relative_to(format_dir)
-
-                records.append(
-                    {
-                        "commit_id": args.commit,
-                        "benchmark": args.benchmark,
-                        "scale_factor": scale_factor,
-                        "format": format_name,
-                        "file": str(relative_path),
-                        "size_bytes": size_bytes,
-                    }
-                )
-
-    # Sort for deterministic output
-    records.sort(key=lambda r: (r["benchmark"], r["scale_factor"], r["format"], r["file"]))
-
-    # Write JSONL output
-    with open(args.output, "w") as f:
-        for record in records:
-            f.write(json.dumps(record) + "\n")
-
-    print(f"Captured {len(records)} file sizes to {args.output}", file=sys.stderr)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/compare-file-sizes.py b/scripts/compare-file-sizes.py
deleted file mode 100644
index f840576c097..00000000000
--- a/scripts/compare-file-sizes.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env python3
-# /// script
-# requires-python = ">=3.11"
-# dependencies = []
-# ///
-
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-"""Compare file sizes between base and HEAD and generate markdown report."""
-
-import argparse
-import json
-import sys
-from collections import defaultdict
-
-
-def format_size(size_bytes: int) -> str:
-    """Format bytes as human-readable size."""
-    if size_bytes >= 1024**3:
-        return f"{size_bytes / (1024**3):.2f} GB"
-    elif size_bytes >= 1024**2:
-        return f"{size_bytes / (1024**2):.2f} MB"
-    elif size_bytes >= 1024:
-        return f"{size_bytes / 1024:.2f} KB"
-    else:
-        return f"{size_bytes} B"
-
-
-def format_change(change_bytes: int) -> str:
-    """Format byte change with sign."""
-    sign = "+" if change_bytes > 0 else ""
-    return f"{sign}{format_size(abs(change_bytes))}"
-
-
-def format_pct_change(pct: float) -> str:
-    """Format percentage change with sign."""
-    sign = "+" if pct > 0 else ""
-    return f"{sign}{pct:.1f}%"
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Compare file sizes between base and HEAD")
-    parser.add_argument("base_file", help="Base JSONL file")
-    parser.add_argument("head_file", help="HEAD JSONL file")
-    args = parser.parse_args()
-
-    # Load base and head data
-    base_data = {}
-    try:
-        with open(args.base_file) as f:
-            for line in f:
-                record = json.loads(line)
-                # Support old records without scale_factor (default to "1.0")
-                scale_factor = record.get("scale_factor", "1.0")
-                key = (record["benchmark"], scale_factor, record["format"], record["file"])
-                base_data[key] = record["size_bytes"]
-    except FileNotFoundError:
-        print("_Base file sizes not found._")
-        sys.exit(0)
-
-    head_data = {}
-    try:
-        with open(args.head_file) as f:
-            for line in f:
-                record = json.loads(line)
-                scale_factor = record.get("scale_factor", "1.0")
-                key = (record["benchmark"], scale_factor, record["format"], record["file"])
-                head_data[key] = record["size_bytes"]
-    except FileNotFoundError:
-        print("_HEAD file sizes not found._")
-        sys.exit(0)
-
-    # Compare sizes
-    comparisons = []
-    format_totals = defaultdict(lambda: {"base": 0, "head": 0})
-
-    all_keys = set(base_data.keys()) | set(head_data.keys())
-    for key in all_keys:
-        benchmark, scale_factor, fmt, file_name = key
-        base_size = base_data.get(key, 0)
-        head_size = head_data.get(key, 0)
-
-        format_totals[fmt]["base"] += base_size
-        format_totals[fmt]["head"] += head_size
-
-        change = head_size - base_size
-        if change == 0:
-            continue
-
-        if base_size > 0:
-            pct_change = (head_size / base_size - 1) * 100
-        elif head_size > 0:
-            pct_change = float("inf")
-        else:
-            pct_change = 0
-
-        comparisons.append(
-            {
-                "file": file_name,
-                "scale_factor": scale_factor,
-                "format": fmt,
-                "base_size": base_size,
-                "head_size": head_size,
-                "change": change,
-                "pct_change": pct_change,
-            }
-        )
-
-    if not comparisons:
-        print("_No file size changes detected._")
-        return
-
-    # Sort by pct_change descending (largest increases first)
-    comparisons.sort(key=lambda x: x["pct_change"], reverse=True)
-
-    # Output markdown table
-    print("| File | Scale | Format | Base | HEAD | Change | % |")
-    print("|------|-------|--------|------|------|--------|---|")
-
-    for comp in comparisons:
-        pct_str = format_pct_change(comp["pct_change"]) if comp["pct_change"] != float("inf") else "new"
-        base_str = format_size(comp["base_size"]) if comp["base_size"] > 0 else "-"
-        print(
-            f"| {comp['file']} | {comp['scale_factor']} | {comp['format']} | {base_str} | "
-            f"{format_size(comp['head_size'])} | {format_change(comp['change'])} | {pct_str} |"
-        )
-
-    # Output totals
-    print("")
-    print("**Totals:**")
-    for fmt in sorted(format_totals.keys()):
-        totals = format_totals[fmt]
-        base_total = totals["base"]
-        head_total = totals["head"]
-        if base_total > 0:
-            total_pct = (head_total / base_total - 1) * 100
-            pct_str = f" ({format_pct_change(total_pct)})"
-        else:
-            pct_str = ""
-        print(f"- {fmt}: {format_size(base_total)} \u2192 {format_size(head_total)}{pct_str}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs
index dc5fa6448ad..941859dc90b 100644
--- a/vortex-bench/src/statpopgen/statpopgen_benchmark.rs
+++ b/vortex-bench/src/statpopgen/statpopgen_benchmark.rs
@@ -52,7 +52,7 @@ impl StatPopGenBenchmark {
             )
         })?;
 
-        let data_path = "statpopgen".to_data_path().join(format!("{n_rows}/"));
+        let data_path = "statspopgen".to_data_path().join(format!("{n_rows}/"));
 
         let data_url =
             Url::from_directory_path(data_path).map_err(|_| anyhow::anyhow!("bad data path?"))?;