18 changes: 14 additions & 4 deletions CHANGELOG.md
@@ -4,11 +4,21 @@

### Added
- **New `hypyp.sync` module**: Modular architecture for connectivity metrics
- Extracted 9 connectivity metrics into separate classes: `PLV`, `CCorr`, `ACorr`, `Coh`, `ImCoh`, `PLI`, `WPLI`, `EnvCorr`, `PowCorr`
- Extracted 9 connectivity metrics into separate classes: `PLV`, `CCorr`, `ACCorr`, `Coh`, `ImCoh`, `PLI`, `WPLI`, `EnvCorr`, `PowCorr`
- `BaseMetric` abstract class for uniform interface across all metrics
- `get_metric(mode, backend)` function for easy metric instantiation
- Backend support infrastructure (numpy default, with future support for numba/torch)
- `get_metric(mode, optimization)` function for easy metric instantiation
- Helper functions: `multiply_conjugate`, `multiply_conjugate_time`, `multiply_product`
- **GPU and numba backends for all 9 sync metrics**:
- numba JIT with `prange`: PLV, CCorr, Coh, ImCoh, PLI, wPLI, EnvCorr, PowCorr
- PyTorch (MPS/CUDA/CPU) via batched einsum: all 9 metrics
- Metal compute shaders (Apple Silicon): PLI, wPLI, ACCorr
- CUDA raw kernels via CuPy (NVIDIA GPUs): all 9 metrics
- Benchmark-driven `AUTO_PRIORITY` table for `optimization='auto'`, compiled from
Mac M4 Max (131 runs) and Narval A100 (111 runs) benchmarks
- `priority` parameter on `get_metric()` and `compute_sync()` for custom backend ordering
- `hypyp/sync/kernels/` submodule with Metal and CUDA dispatch infrastructure
- New optional dependencies: `pyobjc-framework-Metal` (Apple), `cupy-cuda12x` (NVIDIA)
- `multiply_conjugate_torch` and `multiply_conjugate_time_torch` GPU helpers

### Changed
- **BREAKING**: `accorr` metric now returns raw connectivity values with shape `(n_epoch, n_freq, 2*n_ch, 2*n_ch)` like all other metrics. The `swapaxes` and `epochs_average` operations are now handled by `compute_sync()` instead of being applied inside the metric.
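  A shape-level sketch of what this means for callers (pure NumPy; the sizes are arbitrary illustrations, and the exact post-processing order is inferred from the note above):

  ```python
  import numpy as np

  # Arbitrary sizes, for illustration only.
  n_epoch, n_freq, n_ch = 10, 5, 4

  # accorr now returns raw values shaped like every other metric:
  raw = np.random.rand(n_epoch, n_freq, 2 * n_ch, 2 * n_ch)

  # compute_sync() now applies the post-processing that used to live
  # inside the metric: swap epoch/freq axes, then average over epochs.
  per_freq = raw.swapaxes(0, 1)        # (n_freq, n_epoch, 2*n_ch, 2*n_ch)
  averaged = per_freq.mean(axis=1)     # epochs_average=True result

  print(averaged.shape)                # (n_freq, 2*n_ch, 2*n_ch)
  ```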
@@ -18,7 +28,7 @@
- `_multiply_conjugate()` in analyses.py - use `hypyp.sync.multiply_conjugate` instead (will be removed in 1.0.0)
- `_multiply_conjugate_time()` in analyses.py - use `hypyp.sync.multiply_conjugate_time` instead (will be removed in 1.0.0)
- `_multiply_product()` in analyses.py - use `hypyp.sync.multiply_product` instead (will be removed in 1.0.0)
- `_accorr_hybrid()` in analyses.py - use `hypyp.sync.ACorr` instead (will be removed in 1.0.0)
- `_accorr_hybrid()` in analyses.py - use `hypyp.sync.ACCorr` instead (will be removed in 1.0.0)

## [0.5.0b13] - 2025-09-18

5 changes: 3 additions & 2 deletions hypyp/analyses.py
@@ -439,7 +439,8 @@ def pair_connectivity(data: Union[list, np.ndarray], sampling_rate: int,


def compute_sync(complex_signal: np.ndarray, mode: str, epochs_average: bool = True,
                 optimization: Optional[str] = None) -> np.ndarray:
                 optimization: Optional[str] = None,
                 priority: Optional[list] = None) -> np.ndarray:
"""
Computes frequency-domain connectivity measures from analytic signals.

@@ -547,7 +548,7 @@ def compute_sync(complex_signal: np.ndarray, mode: str, epochs_average: bool = T

# Get the metric from the sync module
try:
metric = get_metric(mode_normalized, optimization=optimization)
metric = get_metric(mode_normalized, optimization=optimization, priority=priority)
con = metric.compute(complex_signal, n_samp, transpose_axes)
except ValueError:
raise ValueError(f'Metric type "{mode}" not supported.')
128 changes: 109 additions & 19 deletions hypyp/sync/README.md
@@ -58,9 +58,6 @@ Arbitrary methodological decisions skew inter-brain synchronization estimates
in hyperscanning-EEG studies. *Imaging Neuroscience*, 2.
https://doi.org/10.1162/imag_a_00350

**Note:** ACCorr supports hardware acceleration via `optimization` parameter.
See [Optimization Backends](#optimization-backends) below.

---

### Coherence (`coh`)
@@ -165,24 +162,110 @@ amplitude. More sensitive to high-amplitude bursts.

## Optimization Backends

ACCorr supports three computational backends via the `optimization` parameter
in `compute_sync()` or the class constructor:
All 9 metrics support multiple computational backends via the `optimization`
parameter in `compute_sync()` or the class constructor.

### Backend Support Matrix

| Metric | numpy | numba | torch | metal | cuda_kernel |
|--------|:-----:|:-----:|:-----:|:-----:|:-----------:|
| PLV | x | x | x | -- | x |
| CCorr | x | x | x | -- | x |
| Coh | x | x | x | -- | x |
| ImCoh | x | x | x | -- | x |
| EnvCorr| x | x | x | -- | x |
| PowCorr| x | x | x | -- | x |
| PLI | x | x | x | x | x |
| wPLI | x | x | x | x | x |
| ACCorr | x | x | x | x | x |

### Backend Descriptions

| Value | Backend | Device | Notes |
|-------|---------|--------|-------|
| `None` (default) | NumPy | CPU | Standard, no extra dependencies |
| `'auto'` | Best available | Auto | torch → numba → numpy |
| `'numba'` | Numba JIT | CPU | ~2× speedup; install: `poetry install --with optim_numba` |
| `'torch'` | PyTorch | GPU/CPU | ~20× speedup on GPU; install: `poetry install --with optim_torch` |
| `'auto'` | Best available | Auto | Selects best GPU backend per metric and platform |
| `'numba'` | Numba JIT | CPU | Fused single-pass kernels with `prange` parallelism |
| `'torch'` | PyTorch | GPU/CPU | Batched einsum; MPS (Apple) / CUDA (NVIDIA) / CPU |
| `'metal'` | Metal shaders | Apple GPU | Custom compute shaders for PLI, wPLI, ACCorr only |
| `'cuda_kernel'` | CuPy RawKernel | NVIDIA GPU | Custom CUDA kernels; float64 precision |

### `optimization='auto'` — Benchmark-Driven Dispatch

The `'auto'` mode selects the best GPU backend for each metric based on
benchmark data compiled from Mac M4 Max (131 runs) and Narval A100 (111 runs).

**MPS (Apple Silicon):**
- Einsum metrics (PLV, CCorr, Coh, ImCoh, EnvCorr, PowCorr): torch (batched BLAS)
- Sign-based (PLI, wPLI) + ACCorr: Metal custom kernels

**CUDA (NVIDIA):**
- All metrics: `cuda_kernel` first (pairwise computation, OOM-safe at 512+ channels),
with torch as fallback.

The priority can be overridden per-call:
```python
get_metric('plv', optimization='auto', priority=['torch', 'cuda_kernel'])
```

If no GPU backend is available, `'auto'` falls back to numba, then numpy.
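
The selection logic can be pictured with a small standalone sketch; the table entries and helper below are hypothetical simplifications — the real `AUTO_PRIORITY` lives in `hypyp/sync/base.py` and covers all nine metrics:

```python
# Hypothetical, simplified priority table; the real benchmark-driven
# AUTO_PRIORITY lives in hypyp/sync/base.py and covers all 9 metrics.
AUTO_PRIORITY = {
    ("plv", "mps"): ["torch", "numba", "numpy"],
    ("pli", "mps"): ["metal", "torch", "numba", "numpy"],
    ("plv", "cuda"): ["cuda_kernel", "torch", "numba", "numpy"],
}

def resolve_backend(metric, platform, available, priority=None):
    """Return the first installed backend from the effective priority list."""
    order = priority or AUTO_PRIORITY.get((metric, platform), ["numba", "numpy"])
    for backend in order:
        if backend in available:
            return backend
    return "numpy"  # numpy is always available

# Metal is preferred for PLI on Apple Silicon, but the fallback chain
# kicks in when only CPU backends are installed:
assert resolve_backend("pli", "mps", {"numba", "numpy"}) == "numba"
```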

### Precision

- **CPU / CUDA (`float64`):** reference precision, `rtol=1e-9, atol=1e-10`
- **MPS / Metal (`float32`):** up to ~1e-5 difference vs CPU reference.
Sign-based metrics (PLI, wPLI) may show larger differences (`rtol=1e-2`)
near the sign discontinuity at zero.
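
These tolerances are easy to check mechanically. A standalone sketch (pure NumPy, no GPU required) that treats a `float32` round-trip as a stand-in for an MPS/Metal result:

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((64, 64))

ref = np.cos(x.astype(np.float64))     # float64 reference (CPU / CUDA)
approx = np.cos(x.astype(np.float32))  # float32 stand-in (MPS / Metal)

# float32 results sit within the loose ~1e-5 band quoted above ...
assert np.allclose(ref, approx.astype(np.float64), rtol=1e-4, atol=1e-5)
# ... but not within the float64 reference tolerance.
assert not np.allclose(ref, approx.astype(np.float64), rtol=1e-9, atol=1e-10)
```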

---

## Architecture

**Device priority for `'torch'` and `'auto'`:** MPS (Apple Silicon) > CUDA (NVIDIA) > CPU.
MPS and CUDA are mutually exclusive; the best available device is selected automatically.
```
hypyp/sync/
├── __init__.py # Registry, get_metric(), exports
├── base.py # BaseMetric, AUTO_PRIORITY, helpers
├── plv.py ... wpli.py # One file per metric (9 files)
└── kernels/ # Custom GPU kernels
├── __init__.py # METAL_AVAILABLE, CUPY_AVAILABLE flags
├── _metal_dispatch.py # Shared Metal pairwise dispatch
├── _cuda_dispatch.py # Shared CUDA pairwise dispatch
├── metal_phase.py # PLI, wPLI Metal shaders
├── metal_accorr.py # ACCorr Metal shader
├── cuda_phase.py # PLI, wPLI, PLV, CCorr CUDA kernels
├── cuda_amplitude.py # Coh, ImCoh, EnvCorr, PowCorr CUDA kernels
└── cuda_accorr.py # ACCorr CUDA kernel
```

**Precision note:** MPS uses `float32`, which may introduce numerical differences
of up to ~1e-5 compared to CPU/CUDA (`float64`).
Each metric class inherits from `BaseMetric` and implements:
- `_compute_numpy()` — always available (reference implementation)
- `_compute_numba()` — fused loop with `numba.prange` parallelism
- `_compute_torch()` — batched einsum on auto-detected device
- `_compute_metal()` — Metal shader dispatch (PLI, wPLI, ACCorr only)
- `_compute_cuda()` — CUDA RawKernel dispatch

All other metrics currently use numpy only (`optimization` parameter is accepted
but ignored for non-ACCorr metrics).
Backend selection happens at `__init__()`, dispatch at `compute()`.

---

## Installation

```bash
# Core (numpy backend always available)
pip install hypyp

# CPU parallelism
pip install "hypyp[numba]"

# GPU acceleration (PyTorch)
pip install "hypyp[torch]"

# Apple Silicon Metal shaders (PLI, wPLI, ACCorr)
pip install "hypyp[metal]"

# NVIDIA CUDA kernels (all metrics, requires CUDA 12.x)
pip install "hypyp[cupy]"
```

---

@@ -192,13 +275,20 @@
from hypyp.analyses import compute_sync

# Standard (numpy)
con = compute_sync(complex_signal, 'accorr')
con = compute_sync(complex_signal, 'plv')

# Best available GPU backend
con = compute_sync(complex_signal, 'plv', optimization='auto')

# Specific backend
con = compute_sync(complex_signal, 'pli', optimization='metal')

# With GPU acceleration
con = compute_sync(complex_signal, 'accorr', optimization='torch')
# Custom priority
con = compute_sync(complex_signal, 'coh', optimization='auto',
                   priority=['torch', 'cuda_kernel'])

# Direct class instantiation
from hypyp.sync import ACCorr
metric = ACCorr(optimization='auto', show_progress=True)
from hypyp.sync import get_metric
metric = get_metric('accorr', optimization='auto')
con = metric.compute(complex_signal_internal, n_samp, transpose_axes)
```
24 changes: 17 additions & 7 deletions hypyp/sync/__init__.py
@@ -10,7 +10,10 @@

from typing import Optional

from .base import BaseMetric, multiply_conjugate, multiply_conjugate_time, multiply_product
from .base import (
    BaseMetric, multiply_conjugate, multiply_conjugate_time, multiply_product,
    multiply_conjugate_torch, multiply_conjugate_time_torch,
)
from .plv import PLV
from .ccorr import CCorr
from .accorr import ACCorr
@@ -40,6 +43,8 @@
'multiply_conjugate',
'multiply_conjugate_time',
'multiply_product',
'multiply_conjugate_torch',
'multiply_conjugate_time_torch',
# Metric classes
'PLV',
'CCorr',
@@ -56,7 +61,8 @@
]


def get_metric(mode: str, optimization: Optional[str] = None) -> BaseMetric:
def get_metric(mode: str, optimization: Optional[str] = None,
               priority: Optional[list] = None) -> BaseMetric:
"""
Get a connectivity metric instance by name.

@@ -66,8 +72,11 @@ def get_metric(mode: str, optimization: Optional[str] = None) -> BaseMetric:
Name of the connectivity metric. One of: 'plv', 'ccorr', 'accorr',
'coh', 'imcoh', 'pli', 'wpli', 'envcorr', 'powcorr'.
optimization : str, optional
Optimization strategy. Options: None, 'auto', 'numba', 'torch'.
See BaseMetric for fallback behavior.
Optimization strategy. Options: None, 'auto', 'numba', 'torch',
'metal', 'cuda_kernel'. See BaseMetric for fallback behavior.
priority : list of str, optional
Custom backend priority for ``'auto'`` mode. Overrides the default
``AUTO_PRIORITY`` table. Example: ``['metal', 'torch', 'numba']``.

Returns
-------
@@ -82,12 +91,13 @@
Examples
--------
>>> from hypyp.sync import get_metric
>>> accorr = get_metric('accorr', optimization='torch')
>>> result = accorr.compute(complex_signal, n_samp, transpose_axes)
>>> plv = get_metric('plv', optimization='auto') # benchmark-driven
>>> pli = get_metric('pli', optimization='auto',
... priority=['numba', 'metal']) # custom priority
"""
mode_lower = mode.lower()
if mode_lower not in METRICS:
available = ', '.join(METRICS.keys())
raise ValueError(f"Unknown metric mode '{mode}'. Available: {available}")

return METRICS[mode_lower](optimization=optimization)
return METRICS[mode_lower](optimization=optimization, priority=priority)