From 2615c1690f25531b03d77cf77ca3f6b0df227265 Mon Sep 17 00:00:00 2001 From: robertvava Date: Sat, 7 Mar 2026 18:21:54 +0000 Subject: [PATCH 1/9] PCA init --- src/hyperview/cli.py | 4 +- src/hyperview/core/dataset.py | 9 +- src/hyperview/embeddings/pipelines.py | 20 +- src/hyperview/embeddings/projection.py | 256 +++++++++++++++++++++- tests/__init__.py | 0 tests/test_pca_projection.py | 284 +++++++++++++++++++++++++ 6 files changed, 554 insertions(+), 19 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_pca_projection.py diff --git a/src/hyperview/cli.py b/src/hyperview/cli.py index d2da15b..8e30cb1 100644 --- a/src/hyperview/cli.py +++ b/src/hyperview/cli.py @@ -95,9 +95,9 @@ def _build_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--method", - choices=["umap"], + choices=["umap", "pca"], default="umap", - help="Projection method (currently only 'umap')", + help="Projection method: 'umap' (default) or 'pca'", ) parser.add_argument( "--geometry", diff --git a/src/hyperview/core/dataset.py b/src/hyperview/core/dataset.py index 09cba36..76cc0e9 100644 --- a/src/hyperview/core/dataset.py +++ b/src/hyperview/core/dataset.py @@ -439,11 +439,12 @@ def compute_visualization( Args: space_key: Embedding space to project. If None, uses the first available. - method: Projection method ('umap' supported). + method: Projection method ('umap' or 'pca'). PCA is faster and + deterministic; UMAP captures nonlinear structure better. geometry: Output geometry type ('euclidean' or 'poincare'). - n_neighbors: Number of neighbors for UMAP. - min_dist: Minimum distance for UMAP. - metric: Distance metric for UMAP. + n_neighbors: Number of neighbors for UMAP (ignored for PCA). + min_dist: Minimum distance for UMAP (ignored for PCA). + metric: Distance metric for UMAP (ignored for PCA). force: Force recomputation even if layout exists. Returns: diff --git a/src/hyperview/embeddings/pipelines.py b/src/hyperview/embeddings/pipelines.py index e3aa716..a46e52a 100644 --- a/src/hyperview/embeddings/pipelines.py +++ b/src/hyperview/embeddings/pipelines.py @@ -106,7 +106,7 @@ def compute_layout( Args: storage: Storage backend with embeddings. space_key: Embedding space to project. If None, uses the first available. - method: Projection method ('umap' supported). + method: Projection method ('umap' or 'pca'). geometry: Output geometry type ('euclidean' or 'poincare'). n_neighbors: Number of neighbors for UMAP. min_dist: Minimum distance for UMAP. @@ -122,8 +122,8 @@ def compute_layout( """ from hyperview.embeddings.projection import ProjectionEngine - if method != "umap": - raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.") + if method not in ("umap", "pca"): + raise ValueError(f"Invalid method: {method}. Supported methods: 'umap', 'pca'.") if geometry not in ("euclidean", "poincare"): raise ValueError(f"Invalid geometry: {geometry}. Must be 'euclidean' or 'poincare'.") @@ -157,11 +157,15 @@ def compute_layout( if len(ids) < 3: raise ValueError(f"Need at least 3 samples for visualization, have {len(ids)}") - layout_params = { - "n_neighbors": n_neighbors, - "min_dist": min_dist, - "metric": metric, - } + if method == "umap": + layout_params = { + "n_neighbors": n_neighbors, + "min_dist": min_dist, + "metric": metric, + } + else: + # PCA is deterministic with no tuning parameters + layout_params = {} layout_key = make_layout_key(space_key, method, geometry, layout_params) if not force: diff --git a/src/hyperview/embeddings/projection.py b/src/hyperview/embeddings/projection.py index 83dff50..68892e5 100644 --- a/src/hyperview/embeddings/projection.py +++ b/src/hyperview/embeddings/projection.py @@ -8,10 +8,124 @@ logger = logging.getLogger(__name__) +_MIN_SAMPLES = 2 +_EPS = 1e-7 class ProjectionEngine: """Engine for projecting high-dimensional embeddings to 2D.""" + def logmap_0_hyperboloid( + self, + embeddings: np.ndarray, + curvature: float = 1.0, + ) -> np.ndarray: + """Logarithmic map from the hyperboloid to the tangent space at the origin. + + Maps points on the hyperboloid H^D (Lorentz model) to the tangent space + T_o H^D at the origin o = (1, 0, ..., 0). The tangent space is Euclidean + R^D, so standard PCA can be applied afterward. + + Simplification at the origin: + _L = -x_0 (Lorentz inner product with north pole) + d_L(o, x) = arccosh(x_0) + spatial = x[1:] + tangent = d_L * spatial / ||spatial|| (if ||spatial|| > 0) + + Args: + embeddings: (N, D+1) array on the hyperboloid (first column is time-like). + curvature: Positive curvature parameter c (sectional curvature is -c). + + Returns: + (N, D) array of tangent vectors at the origin. + """ + if embeddings.ndim != 2 or embeddings.shape[1] < 2: + raise ValueError( + "embeddings must have shape (N, D+1) with D >= 1, " + f"got {embeddings.shape}" + ) + + c = float(curvature) + sqrt_c = np.sqrt(c) + + t = embeddings[:, 0] # time component, shape (N,) + x = embeddings[:, 1:] # spatial components, shape (N, D) + + # Hyperbolic distance from origin: d = (1/sqrt(c)) * arccosh(sqrt(c) * t) + # For c=1 this simplifies to arccosh(t). + arg = np.clip(sqrt_c * t, 1.0 + _EPS, None) # ensure > 1 for arccosh + theta = np.arccosh(arg) / sqrt_c # shape (N,) + + norm_x = np.linalg.norm(x, axis=1) # shape (N,) + # For points very close to the origin, norm_x ≈ 0 and theta ≈ 0. + # The tangent vector is simply 0 in that case. + safe_norm = np.where(norm_x > _EPS, norm_x, 1.0) + scale = np.where(norm_x > _EPS, theta / safe_norm, 0.0) + + tangent = x * scale[:, np.newaxis] + + logger.debug( + "logmap_0: input %s -> tangent %s, max_dist=%.4f", + embeddings.shape, + tangent.shape, + float(theta.max()) if len(theta) else 0.0, + ) + return tangent.astype(np.float32) + + def expmap_0_hyperboloid( + self, + tangent_vectors: np.ndarray, + curvature: float = 1.0, + ) -> np.ndarray: + """Exponential map from the tangent space at the origin back to the hyperboloid. + + Inverse of ``logmap_0_hyperboloid``. + + expmap_0(v): + norm_v = ||v|| + t = cosh(sqrt(c) * norm_v) / sqrt(c) ... but for unit hyperboloid (c=1): t = cosh(norm_v) + spatial = sinh(sqrt(c) * norm_v) / (sqrt(c) * norm_v) * v + + For c = 1: + t = cosh(norm_v) + spatial = sinh(norm_v) * v / norm_v + + Args: + tangent_vectors: (N, D) array in the tangent space at the origin. + curvature: Positive curvature parameter c. + + Returns: + (N, D+1) array of points on the hyperboloid. + """ + if tangent_vectors.ndim != 2: + raise ValueError( + f"tangent_vectors must be 2-D, got shape {tangent_vectors.shape}" + ) + + c = float(curvature) + sqrt_c = np.sqrt(c) + + norm_v = np.linalg.norm(tangent_vectors, axis=1) # (N,) + scaled_norm = sqrt_c * norm_v + + t = np.cosh(scaled_norm) / sqrt_c # time component + # sinh(x)/x -> 1 as x -> 0 + safe_scaled = np.where(scaled_norm > _EPS, scaled_norm, 1.0) + coeff = np.where( + scaled_norm > _EPS, + np.sinh(scaled_norm) / safe_scaled, + 1.0, + ) + spatial = tangent_vectors * coeff[:, np.newaxis] + + result = np.column_stack([t, spatial]) + + logger.debug( + "expmap_0: tangent %s -> hyperboloid %s", + tangent_vectors.shape, + result.shape, + ) + return result.astype(np.float32) + def to_poincare_ball( self, hyperboloid_embeddings: np.ndarray, @@ -52,7 +166,6 @@ def to_poincare_ball( # Rescale to unit ball: u = u_R / R = sqrt(c) * u_R u = u_R / R - # Numerical guard: ensure inside the unit ball radii = np.linalg.norm(u, axis=1) mask = radii >= clamp_radius if np.any(mask): @@ -77,14 +190,14 @@ def project( This separates two concerns: 1) Geometry/model transforms for the *input* embeddings (e.g. hyperboloid -> Poincaré) - 2) Dimensionality reduction / layout (currently UMAP) + 2) Dimensionality reduction / layout (UMAP or PCA) Args: embeddings: Input embeddings (N x D) or hyperboloid (N x D+1). input_geometry: Geometry/model of the input embeddings (euclidean, hyperboloid). output_geometry: Geometry of the output coordinates (euclidean, poincare). curvature: Curvature parameter for hyperbolic embeddings (positive c). - method: Layout method (currently only 'umap'). + method: Layout method ('umap' or 'pca'). n_neighbors: UMAP neighbors. min_dist: UMAP min_dist. metric: Input metric (used for euclidean inputs). @@ -93,8 +206,18 @@ def project( Returns: 2D coordinates (N x 2). """ - if method != "umap": - raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.") + if method not in ("umap", "pca"): + raise ValueError( + f"Invalid method: {method}. Supported methods: 'umap', 'pca'." + ) + + if method == "pca": + return self._project_pca( + embeddings, + input_geometry=input_geometry, + output_geometry=output_geometry, + curvature=curvature or 1.0, + ) prepared = embeddings prepared_metric: str = metric @@ -127,6 +250,129 @@ def project( f"Invalid output_geometry: {output_geometry}. Must be 'euclidean' or 'poincare'." ) + + def _project_pca( + self, + embeddings: np.ndarray, + *, + input_geometry: str, + output_geometry: str, + curvature: float = 1.0, + ) -> np.ndarray: + """Dispatch PCA projection based on input/output geometry. + + Handles four combinations: + - euclidean -> euclidean: Standard PCA, normalize to [-1, 1] + - euclidean -> poincare: Standard PCA, tanh disk mapping + - hyperboloid -> euclidean: logmap_0, PCA, normalize to [-1, 1] + - hyperboloid -> poincare: logmap_0, PCA to 2D, expmap_0 back, stereographic to disk + """ + n_samples = embeddings.shape[0] + if n_samples < _MIN_SAMPLES: + raise ValueError( + f"PCA requires at least {_MIN_SAMPLES} samples, got {n_samples}." + ) + + if not np.all(np.isfinite(embeddings)): + raise ValueError("Embeddings contain NaN or Inf values.") + + logger.info( + "PCA: input_geometry=%s, output_geometry=%s, N=%d, D=%d", + input_geometry, + output_geometry, + n_samples, + embeddings.shape[1], + ) + + if input_geometry == "hyperboloid": + data = self.logmap_0_hyperboloid(embeddings, curvature=curvature) + else: + data = embeddings + + pca_2d, explained = self._pca_svd(data, n_components=2) + + logger.info("PCA: explained variance ratio: [%.4f, %.4f]", *explained) + + if output_geometry == "poincare": + if input_geometry == "hyperboloid": + # Tangent PCA -> expmap back to H^2 -> stereographic to Poincaré disk + hyp_points = self.expmap_0_hyperboloid(pca_2d, curvature=curvature) + coords = self.to_poincare_ball(hyp_points, curvature=curvature) + coords = self._center_poincare(coords) + coords = self._scale_poincare(coords, factor=0.65) + else: + # Euclidean PCA -> tanh disk mapping + coords = self._pca_to_poincare_disk(pca_2d) + return coords.astype(np.float32) + + return self._normalize_coords(pca_2d).astype(np.float32) + + def _pca_svd( + self, + data: np.ndarray, + n_components: int = 2, + ) -> tuple[np.ndarray, np.ndarray]: + """Compute PCA via NumPy SVD. No scikit-learn dependency. + + Args: + data: (N, D) array of data points. + n_components: Number of principal components to keep. + + Returns: + Tuple of: + - (N, n_components) projected data + - (n_components,) explained variance ratios + """ + data = data.astype(np.float64) + mean = data.mean(axis=0) + centered = data - mean + + # Economy SVD (full_matrices=False) is O(N * D * min(N,D)) + _U, S, Vt = np.linalg.svd(centered, full_matrices=False) + + projected = centered @ Vt[:n_components].T + + variance = S**2 / max(len(data) - 1, 1) + total_var = variance.sum() + if total_var > 0: + explained = variance[:n_components] / total_var + else: + explained = np.zeros(n_components) + + return projected.astype(np.float32), explained + + def _pca_to_poincare_disk( + self, + coords_2d: np.ndarray, + alpha: float = 1.0, + ) -> np.ndarray: + """Map 2D Euclidean PCA output into the Poincaré disk via tanh scaling. + + tanh(alpha * r) * z / r for each point z with r = ||z|| + + This is a homeomorphism R^2 -> B^2 (open unit ball). + + Args: + coords_2d: (N, 2) PCA coordinates. + alpha: Scaling factor controlling radial compression (default 1.0). + + Returns: + (N, 2) Poincaré disk coordinates. + """ + max_abs = np.abs(coords_2d).max() + if max_abs > _EPS: + coords_2d = coords_2d / max_abs + + radii = np.linalg.norm(coords_2d, axis=1) + safe_radii = np.where(radii > _EPS, radii, 1.0) + mapped_radii = np.tanh(alpha * radii) + scale = np.where(radii > _EPS, mapped_radii / safe_radii, 0.0) + poincare = coords_2d * scale[:, np.newaxis] + poincare = self._center_poincare(poincare) + poincare = self._scale_poincare(poincare, factor=0.65) + + return poincare.astype(np.float32) + def project_umap( self, embeddings: np.ndarray, diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_pca_projection.py b/tests/test_pca_projection.py new file mode 100644 index 0000000..3581810 --- /dev/null +++ b/tests/test_pca_projection.py @@ -0,0 +1,284 @@ +import numpy as np +import pytest + +from hyperview.embeddings.projection import ProjectionEngine + + +def _random_euclidean(n: int = 50, d: int = 64, seed: int = 42) -> np.ndarray: + """Generate random Euclidean embeddings (N, D).""" + rng = np.random.default_rng(seed) + return rng.standard_normal((n, d)).astype(np.float32) + + +def _random_hyperboloid(n: int = 50, d: int = 64, seed: int = 42) -> np.ndarray: + """Generate random points on the hyperboloid H^d (Lorentz model). + + Points satisfy t^2 - ||x||^2 = 1 (curvature c=1). + We sample spatial components from a normal distribution scaled down, + then compute t = sqrt(1 + ||x||^2). + """ + rng = np.random.default_rng(seed) + x = rng.standard_normal((n, d)).astype(np.float64) * 0.5 + t = np.sqrt(1.0 + np.sum(x**2, axis=1, keepdims=True)) + return np.hstack([t, x]).astype(np.float32) + + +class TestPcaEuclideanToEuclidean: + """Euclidean input -> Euclidean output (standard PCA).""" + + def test_output_shape(self): + engine = ProjectionEngine() + data = _random_euclidean(30, 32) + coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + assert coords.shape == (30, 2) + + def test_normalized_range(self): + engine = ProjectionEngine() + data = _random_euclidean(50, 64) + coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + assert np.all(coords >= -1.0) + assert np.all(coords <= 1.0) + + def test_dtype_float32(self): + engine = ProjectionEngine() + data = _random_euclidean(20, 16) + coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + assert coords.dtype == np.float32 + + +class TestPcaEuclideanToPoincare: + """Euclidean input -> Poincare disk output.""" + + def test_output_shape(self): + engine = ProjectionEngine() + data = _random_euclidean(30, 32) + coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="poincare") + assert coords.shape == (30, 2) + + def test_inside_unit_disk(self): + engine = ProjectionEngine() + data = _random_euclidean(50, 64) + coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="poincare") + radii = np.linalg.norm(coords, axis=1) + assert np.all(radii < 1.0), f"Max radius: {radii.max()}" + + +class TestPcaHyperboloidToEuclidean: + """Hyperboloid input -> Euclidean output (tangent PCA).""" + + def test_output_shape(self): + engine = ProjectionEngine() + data = _random_hyperboloid(30, 32) + coords = engine.project(data, method="pca", input_geometry="hyperboloid", output_geometry="euclidean") + assert coords.shape == (30, 2) + + def test_normalized_range(self): + engine = ProjectionEngine() + data = _random_hyperboloid(50, 64) + coords = engine.project(data, method="pca", input_geometry="hyperboloid", output_geometry="euclidean") + assert np.all(coords >= -1.0) + assert np.all(coords <= 1.0) + + +class TestPcaHyperboloidToPoincare: + """Hyperboloid input -> Poincare disk output (tangent PCA + expmap).""" + + def test_output_shape(self): + engine = ProjectionEngine() + data = _random_hyperboloid(30, 32) + coords = engine.project(data, method="pca", input_geometry="hyperboloid", output_geometry="poincare") + assert coords.shape == (30, 2) + + def test_inside_unit_disk(self): + engine = ProjectionEngine() + data = _random_hyperboloid(50, 64) + coords = engine.project(data, method="pca", input_geometry="hyperboloid", output_geometry="poincare") + radii = np.linalg.norm(coords, axis=1) + assert np.all(radii < 1.0), f"Max radius: {radii.max()}" + + +class TestPcaDeterminism: + """PCA should produce identical results on identical input.""" + + def test_euclidean_deterministic(self): + engine = ProjectionEngine() + data = _random_euclidean(40, 32) + c1 = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + c2 = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + np.testing.assert_array_equal(c1, c2) + + def test_hyperboloid_deterministic(self): + engine = ProjectionEngine() + data = _random_hyperboloid(40, 32) + c1 = engine.project(data, method="pca", input_geometry="hyperboloid", output_geometry="poincare") + c2 = engine.project(data, method="pca", input_geometry="hyperboloid", output_geometry="poincare") + np.testing.assert_array_equal(c1, c2) + + +class TestLogmapExpmapRoundtrip: + """logmap_0(expmap_0(v)) should approximately recover v.""" + + def test_roundtrip(self): + engine = ProjectionEngine() + rng = np.random.default_rng(123) + # Tangent vectors with moderate norms + v = rng.standard_normal((20, 8)).astype(np.float32) * 0.5 + hyp = engine.expmap_0_hyperboloid(v, curvature=1.0) + recovered = engine.logmap_0_hyperboloid(hyp, curvature=1.0) + np.testing.assert_allclose(recovered, v, atol=1e-4) + + def test_roundtrip_nonunit_curvature(self): + engine = ProjectionEngine() + rng = np.random.default_rng(456) + v = rng.standard_normal((15, 5)).astype(np.float32) * 0.3 + c = 2.0 + hyp = engine.expmap_0_hyperboloid(v, curvature=c) + recovered = engine.logmap_0_hyperboloid(hyp, curvature=c) + np.testing.assert_allclose(recovered, v, atol=1e-4) + + +class TestEdgeCases: + + def test_insufficient_samples(self): + engine = ProjectionEngine() + data = _random_euclidean(1, 16) + with pytest.raises(ValueError, match="at least"): + engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + + def test_nan_input_raises(self): + engine = ProjectionEngine() + data = _random_euclidean(10, 16) + data[3, 5] = np.nan + with pytest.raises(ValueError, match="NaN or Inf"): + engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + + def test_inf_input_raises(self): + engine = ProjectionEngine() + data = _random_euclidean(10, 16) + data[0, 0] = np.inf + with pytest.raises(ValueError, match="NaN or Inf"): + engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + + def test_identical_embeddings(self): + """All-same embeddings -> zero variance -> should not crash.""" + engine = ProjectionEngine() + data = np.ones((10, 16), dtype=np.float32) + coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + assert coords.shape == (10, 2) + assert np.allclose(coords, 0.0, atol=1e-6) + + def test_invalid_method(self): + engine = ProjectionEngine() + data = _random_euclidean(10, 16) + with pytest.raises(ValueError, match="Invalid method"): + engine.project(data, method="tsne", input_geometry="euclidean", output_geometry="euclidean") + + def test_two_samples_minimal(self): + """Exactly 2 samples — the minimum for PCA.""" + engine = ProjectionEngine() + data = _random_euclidean(2, 16) + coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") + assert coords.shape == (2, 2) + + +class TestIntegrationDatasetApi: + + def test_compute_visualization_pca_euclidean(self): + from hyperview.core.dataset import Dataset + from hyperview.core.sample import Sample + + ds = Dataset("test_pca", persist=False) + + for i in range(20): + ds.add_sample(Sample(id=f"s{i}", filepath=f"/fake/{i}.png", label=f"c{i % 3}")) + + space_key = "test_space" + ds._storage.ensure_space( + model_id="test", + dim=32, + config={"provider": "test", "geometry": "euclidean"}, + space_key=space_key, + ) + rng = np.random.default_rng(99) + ids = [f"s{i}" for i in range(20)] + vectors = rng.standard_normal((20, 32)).astype(np.float32) + ds._storage.add_embeddings(space_key, ids, vectors) + + layout_key = ds.compute_visualization( + space_key=space_key, + method="pca", + geometry="euclidean", + ) + + assert layout_key is not None + vis_ids, vis_labels, vis_coords = ds.get_visualization_data(layout_key) + assert len(vis_ids) == 20 + assert vis_coords.shape == (20, 2) + + def test_compute_visualization_pca_poincare(self): + from hyperview.core.dataset import Dataset + from hyperview.core.sample import Sample + + ds = Dataset("test_pca_poinc", persist=False) + + for i in range(20): + ds.add_sample(Sample(id=f"s{i}", filepath=f"/fake/{i}.png", label=f"c{i % 3}")) + + space_key = "test_space" + ds._storage.ensure_space( + model_id="test", + dim=32, + config={"provider": "test", "geometry": "euclidean"}, + space_key=space_key, + ) + rng = np.random.default_rng(99) + ids = [f"s{i}" for i in range(20)] + vectors = rng.standard_normal((20, 32)).astype(np.float32) + ds._storage.add_embeddings(space_key, ids, vectors) + + layout_key = ds.compute_visualization( + space_key=space_key, + method="pca", + geometry="poincare", + ) + + assert layout_key is not None + vis_ids, _, vis_coords = ds.get_visualization_data(layout_key) + assert len(vis_ids) == 20 + radii = np.linalg.norm(vis_coords, axis=1) + assert np.all(radii < 1.0) + + def test_compute_visualization_pca_hyperboloid(self): + """Integration test with hyperboloid input geometry.""" + from hyperview.core.dataset import Dataset + from hyperview.core.sample import Sample + + ds = Dataset("test_pca_hyp", persist=False) + + for i in range(20): + ds.add_sample(Sample(id=f"s{i}", filepath=f"/fake/{i}.png", label=f"c{i % 3}")) + + emb_dim = 33 # D+1 for hyperboloid + space_key = "test_hyp_space" + ds._storage.ensure_space( + model_id="test_hyp", + dim=emb_dim, + config={"provider": "test", "geometry": "hyperboloid", "curvature": 1.0}, + space_key=space_key, + ) + + ids = [f"s{i}" for i in range(20)] + vectors = _random_hyperboloid(20, emb_dim - 1, seed=77) + ds._storage.add_embeddings(space_key, ids, vectors) + + for geom in ("euclidean", "poincare"): + layout_key = ds.compute_visualization( + space_key=space_key, + method="pca", + geometry=geom, + ) + assert layout_key is not None + vis_ids, _, vis_coords = ds.get_visualization_data(layout_key) + assert len(vis_ids) == 20 + assert vis_coords.shape == (20, 2) + assert np.all(np.isfinite(vis_coords)) From 6a090651f03f86f0741a30dd0171901411547abc Mon Sep 17 00:00:00 2001 From: Matin Mahmood <45293386+mnm-matin@users.noreply.github.com> Date: Sun, 8 Mar 2026 12:59:31 +0100 Subject: [PATCH 2/9] Fix Devin review CI prompt --- .github/workflows/devin-review.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/devin-review.yml b/.github/workflows/devin-review.yml index ea1410b..cb36e41 100644 --- a/.github/workflows/devin-review.yml +++ b/.github/workflows/devin-review.yml @@ -2,11 +2,15 @@ name: Devin Review on: pull_request: - types: [opened, synchronize, reopened] + types: [opened, synchronize, reopened, ready_for_review] jobs: devin-review: runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + pull-requests: write steps: - name: Checkout repository @@ -26,4 +30,4 @@ jobs: env: CI: true # Ensures the tool runs in non-interactive CI mode run: | - script -q -e -c "npx devin-review ${{ github.event.pull_request.html_url }}" /dev/null + script -q -e -c "printf 'y\n' | npx devin-review '${{ github.event.pull_request.html_url }}'" /dev/null From c3194db31286b110f82d5cd21b0ceca4933dbf72 Mon Sep 17 00:00:00 2001 From: Matin Mahmood <45293386+mnm-matin@users.noreply.github.com> Date: Sun, 8 Mar 2026 13:48:15 +0100 Subject: [PATCH 3/9] Switch Devin review to API workflow --- .github/workflows/devin-review.yml | 144 +++++++++++++++++++++++++---- 1 file changed, 128 insertions(+), 16 deletions(-) diff --git a/.github/workflows/devin-review.yml b/.github/workflows/devin-review.yml index cb36e41..5db6ed7 100644 --- a/.github/workflows/devin-review.yml +++ b/.github/workflows/devin-review.yml @@ -1,33 +1,145 @@ name: Devin Review on: - pull_request: + pull_request_target: types: [opened, synchronize, reopened, ready_for_review] + workflow_dispatch: + inputs: + pr_number: + description: Pull request number to review + required: true + type: string jobs: devin-review: + if: ${{ github.event_name == 'workflow_dispatch' || !github.event.pull_request.draft }} runs-on: ubuntu-latest - timeout-minutes: 30 + timeout-minutes: 10 + env: + DEVIN_API_KEY: ${{ secrets.DEVIN_API_KEY }} permissions: contents: read - pull-requests: write + pull-requests: read steps: - - name: Checkout repository - uses: actions/checkout@v4 + - name: Resolve pull request context + id: pr + uses: actions/github-script@v8 with: - fetch-depth: 0 + script: | + const prNumber = context.eventName === 'workflow_dispatch' + ? Number(core.getInput('pr_number')) + : context.payload.pull_request.number; - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' + if (!Number.isInteger(prNumber) || prNumber <= 0) { + core.setFailed(`Invalid pull request number: ${prNumber}`); + return; + } + + const { owner, repo } = context.repo; + const { data: pr } = await github.rest.pulls.get({ + owner, + repo, + pull_number: prNumber, + }); + const files = await github.paginate(github.rest.pulls.listFiles, { + owner, + repo, + pull_number: prNumber, + per_page: 100, + }); + + core.setOutput('number', String(prNumber)); + core.setOutput('url', pr.html_url); + core.setOutput('title', pr.title); + core.setOutput('author', pr.user.login); + core.setOutput('head_sha', pr.head.sha); + core.setOutput('head_repo', pr.head.repo.full_name); + core.setOutput('base_repo', pr.base.repo.full_name); + core.setOutput('files_json', JSON.stringify(files.map((file) => file.filename))); + + - name: Validate Devin API configuration + if: ${{ env.DEVIN_API_KEY == '' }} + run: | + echo "DEVIN_API_KEY is not configured for this repository or organization." >&2 + echo "Configure DEVIN_API_KEY before using the API-based Devin review workflow." >&2 + exit 1 - - name: Run Devin Review - # Use script to emulate a TTY as devin-review requires terminal features - # The -q flag suppresses script output, -e exits with command exit code, - # and -c runs the command. /dev/null discards script's own output. + - name: Start Devin review session + id: devin env: - CI: true # Ensures the tool runs in non-interactive CI mode + PR_NUMBER: ${{ steps.pr.outputs.number }} + PR_URL: ${{ steps.pr.outputs.url }} + PR_TITLE: ${{ steps.pr.outputs.title }} + PR_AUTHOR: ${{ steps.pr.outputs.author }} + PR_HEAD_SHA: ${{ steps.pr.outputs.head_sha }} + PR_HEAD_REPO: ${{ steps.pr.outputs.head_repo }} + PR_BASE_REPO: ${{ steps.pr.outputs.base_repo }} + PR_FILES_JSON: ${{ steps.pr.outputs.files_json }} + REPOSITORY: ${{ github.repository }} run: | - script -q -e -c "printf 'y\n' | npx devin-review '${{ github.event.pull_request.html_url }}'" /dev/null + PROMPT=$(cat <&2 + exit 1 + fi + + echo "session_id=$SESSION_ID" >> "$GITHUB_OUTPUT" + echo "session_url=$SESSION_URL" >> "$GITHUB_OUTPUT" + + { + echo "Started Devin review session." + echo + echo "- PR: #${PR_NUMBER}" + echo "- Session ID: ${SESSION_ID}" + echo "- Session URL: ${SESSION_URL}" + } >> "$GITHUB_STEP_SUMMARY" From 2f65ac19d7d7068ece5150caebc69196b63f2416 Mon Sep 17 00:00:00 2001 From: Matin Mahmood <45293386+mnm-matin@users.noreply.github.com> Date: Sun, 8 Mar 2026 13:49:41 +0100 Subject: [PATCH 4/9] Fix Devin workflow dispatch input --- .github/workflows/devin-review.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/devin-review.yml b/.github/workflows/devin-review.yml index 5db6ed7..444ce16 100644 --- a/.github/workflows/devin-review.yml +++ b/.github/workflows/devin-review.yml @@ -25,10 +25,12 @@ jobs: - name: Resolve pull request context id: pr uses: actions/github-script@v8 + env: + WORKFLOW_PR_NUMBER: ${{ inputs.pr_number }} with: script: | const prNumber = context.eventName === 'workflow_dispatch' - ? Number(core.getInput('pr_number')) + ? Number(process.env.WORKFLOW_PR_NUMBER) : context.payload.pull_request.number; if (!Number.isInteger(prNumber) || prNumber <= 0) { From 9bc386b9b5047d66f13c7f502b059addd1ed7fdc Mon Sep 17 00:00:00 2001 From: Matin Mahmood <45293386+mnm-matin@users.noreply.github.com> Date: Sun, 8 Mar 2026 13:58:54 +0100 Subject: [PATCH 5/9] Use zero-secret Devin review link --- .github/workflows/devin-review.yml | 125 +++++------------------------ 1 file changed, 22 insertions(+), 103 deletions(-) diff --git a/.github/workflows/devin-review.yml b/.github/workflows/devin-review.yml index 444ce16..66a92f1 100644 --- a/.github/workflows/devin-review.yml +++ b/.github/workflows/devin-review.yml @@ -1,7 +1,7 @@ name: Devin Review on: - pull_request_target: + pull_request: types: [opened, synchronize, reopened, ready_for_review] workflow_dispatch: inputs: @@ -12,18 +12,15 @@ on: jobs: devin-review: - if: ${{ github.event_name == 'workflow_dispatch' || !github.event.pull_request.draft }} runs-on: ubuntu-latest - timeout-minutes: 10 - env: - DEVIN_API_KEY: ${{ secrets.DEVIN_API_KEY }} + timeout-minutes: 5 permissions: contents: read pull-requests: read steps: - - name: Resolve pull request context - id: pr + - name: Resolve Devin Review URL + id: review uses: actions/github-script@v8 env: WORKFLOW_PR_NUMBER: ${{ inputs.pr_number }} @@ -39,109 +36,31 @@ jobs: } const { owner, repo } = context.repo; - const { data: pr } = await github.rest.pulls.get({ - owner, - repo, - pull_number: prNumber, - }); - const files = await github.paginate(github.rest.pulls.listFiles, { - owner, - repo, - pull_number: prNumber, - per_page: 100, - }); + const reviewUrl = `https://devinreview.com/${owner}/${repo}/pull/${prNumber}`; + const prUrl = `https://github.com/${owner}/${repo}/pull/${prNumber}`; core.setOutput('number', String(prNumber)); - core.setOutput('url', pr.html_url); - core.setOutput('title', pr.title); - core.setOutput('author', pr.user.login); - core.setOutput('head_sha', pr.head.sha); - core.setOutput('head_repo', pr.head.repo.full_name); - core.setOutput('base_repo', pr.base.repo.full_name); - core.setOutput('files_json', JSON.stringify(files.map((file) => file.filename))); + core.setOutput('pr_url', prUrl); + core.setOutput('review_url', reviewUrl); - - name: Validate Devin API configuration - if: ${{ env.DEVIN_API_KEY == '' }} + - name: Warm Devin Review page + env: + REVIEW_URL: ${{ steps.review.outputs.review_url }} run: | - echo "DEVIN_API_KEY is not configured for this repository or organization." >&2 - echo "Configure DEVIN_API_KEY before using the API-based Devin review workflow." >&2 - exit 1 + curl --fail --silent --show-error --location "$REVIEW_URL" --output /dev/null - - name: Start Devin review session - id: devin + - name: Publish Devin Review summary env: - PR_NUMBER: ${{ steps.pr.outputs.number }} - PR_URL: ${{ steps.pr.outputs.url }} - PR_TITLE: ${{ steps.pr.outputs.title }} - PR_AUTHOR: ${{ steps.pr.outputs.author }} - PR_HEAD_SHA: ${{ steps.pr.outputs.head_sha }} - PR_HEAD_REPO: ${{ steps.pr.outputs.head_repo }} - PR_BASE_REPO: ${{ steps.pr.outputs.base_repo }} - PR_FILES_JSON: ${{ steps.pr.outputs.files_json }} - REPOSITORY: ${{ github.repository }} + PR_NUMBER: ${{ steps.review.outputs.number }} + PR_URL: ${{ steps.review.outputs.pr_url }} + REVIEW_URL: ${{ steps.review.outputs.review_url }} run: | - PROMPT=$(cat <&2 - exit 1 - fi - - echo "session_id=$SESSION_ID" >> "$GITHUB_OUTPUT" - echo "session_url=$SESSION_URL" >> "$GITHUB_OUTPUT" - { - echo "Started Devin review session." + echo "Devin Review is available for PR #${PR_NUMBER}." + echo + echo "- GitHub PR: ${PR_URL}" + echo "- Devin Review: ${REVIEW_URL}" echo - echo "- PR: #${PR_NUMBER}" - echo "- Session ID: ${SESSION_ID}" - echo "- Session URL: ${SESSION_URL}" + echo "This workflow intentionally does not use DEVIN_API_KEY." + echo "For automatic Devin statuses or comments inside GitHub, connect the Devin GitHub integration and enable auto-review in Devin settings." } >> "$GITHUB_STEP_SUMMARY" From 493d90eb6aa2557870540e3185ce679aed9fd281 Mon Sep 17 00:00:00 2001 From: Matin Mahmood <45293386+mnm-matin@users.noreply.github.com> Date: Sun, 8 Mar 2026 14:20:32 +0100 Subject: [PATCH 6/9] Comment Devin review link on PR --- .github/workflows/devin-review.yml | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/.github/workflows/devin-review.yml b/.github/workflows/devin-review.yml index 66a92f1..3c38d5e 100644 --- a/.github/workflows/devin-review.yml +++ b/.github/workflows/devin-review.yml @@ -16,6 +16,7 @@ jobs: timeout-minutes: 5 permissions: contents: read + issues: write pull-requests: read steps: @@ -64,3 +65,50 @@ jobs: echo "This workflow intentionally does not use DEVIN_API_KEY." echo "For automatic Devin statuses or comments inside GitHub, connect the Devin GitHub integration and enable auto-review in Devin settings." } >> "$GITHUB_STEP_SUMMARY" + + - name: Upsert PR comment with Devin Review link + uses: actions/github-script@v8 + env: + PR_NUMBER: ${{ steps.review.outputs.number }} + PR_URL: ${{ steps.review.outputs.pr_url }} + REVIEW_URL: ${{ steps.review.outputs.review_url }} + with: + script: | + const marker = ''; + const issue_number = Number(process.env.PR_NUMBER); + const body = [ + marker, + 'Devin Review is available for this pull request.', + '', + `- GitHub PR: ${process.env.PR_URL}`, + `- Devin Review: ${process.env.REVIEW_URL}`, + '', + 'This link opens the hosted Devin Review page for the current PR.', + ].join('\n'); + + const comments = await github.paginate(github.rest.issues.listComments, { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number, + per_page: 100, + }); + + const existing = comments.find((comment) => comment.body && comment.body.includes(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + core.info(`Updated existing Devin Review comment: ${existing.html_url}`); + } else { + const created = await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number, + body, + }); + core.info(`Created Devin Review comment: ${created.data.html_url}`); + } From 8be1df11f24e7477691d8286dabe032a0df33c57 Mon Sep 17 00:00:00 2001 From: Matin Mahmood <45293386+mnm-matin@users.noreply.github.com> Date: Sun, 8 Mar 2026 14:24:41 +0100 Subject: [PATCH 7/9] Handle read-only workflow token --- .github/workflows/devin-review.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/devin-review.yml b/.github/workflows/devin-review.yml index 3c38d5e..4787e8d 100644 --- a/.github/workflows/devin-review.yml +++ b/.github/workflows/devin-review.yml @@ -67,6 +67,8 @@ jobs: } >> "$GITHUB_STEP_SUMMARY" - name: Upsert PR comment with Devin Review link + id: comment + continue-on-error: true uses: actions/github-script@v8 env: PR_NUMBER: ${{ steps.review.outputs.number }} @@ -112,3 +114,14 @@ jobs: }); core.info(`Created Devin Review comment: ${created.data.html_url}`); } + + - name: Report comment permission limitation + if: ${{ steps.comment.outcome == 'failure' }} + run: | + { + echo + echo "PR comment was not posted automatically." + echo + echo "This repository currently sets GitHub Actions GITHUB_TOKEN to read-only permissions, so the workflow cannot create issue comments." + echo "If repository workflow permissions are changed to read/write, this step will start posting or updating the PR comment automatically." + } >> "$GITHUB_STEP_SUMMARY" From 6e915eb6448a6a869550f495a884edbb941fc327 Mon Sep 17 00:00:00 2001 From: Matin Mahmood <45293386+mnm-matin@users.noreply.github.com> Date: Sun, 8 Mar 2026 14:26:01 +0100 Subject: [PATCH 8/9] Handle comment permission warning cleanly --- .github/workflows/devin-review.yml | 56 ++++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/.github/workflows/devin-review.yml b/.github/workflows/devin-review.yml index 4787e8d..5125ed2 100644 --- a/.github/workflows/devin-review.yml +++ b/.github/workflows/devin-review.yml @@ -68,7 +68,6 @@ jobs: - name: Upsert PR comment with Devin Review link id: comment - continue-on-error: true uses: actions/github-script@v8 env: PR_NUMBER: ${{ steps.review.outputs.number }} @@ -78,6 +77,7 @@ jobs: script: | const marker = ''; const issue_number = Number(process.env.PR_NUMBER); + core.setOutput('posted', 'false'); const body = [ marker, 'Devin Review is available for this pull request.', @@ -88,35 +88,45 @@ jobs: 'This link opens the hosted Devin Review page for the current PR.', ].join('\n'); - const comments = await github.paginate(github.rest.issues.listComments, { - owner: context.repo.owner, - repo: context.repo.repo, - issue_number, - per_page: 100, - }); - - const existing = comments.find((comment) => comment.body && comment.body.includes(marker)); - - if (existing) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: existing.id, - body, - }); - core.info(`Updated existing Devin Review comment: ${existing.html_url}`); - } else { - const created = await github.rest.issues.createComment({ + try { + const comments = await github.paginate(github.rest.issues.listComments, { owner: context.repo.owner, repo: context.repo.repo, issue_number, - body, + per_page: 100, }); - core.info(`Created Devin Review comment: ${created.data.html_url}`); + + const existing = comments.find((comment) => comment.body && comment.body.includes(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + core.info(`Updated existing Devin Review comment: ${existing.html_url}`); + } else { + const created = await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number, + body, + }); + core.info(`Created Devin Review comment: ${created.data.html_url}`); + } + + core.setOutput('posted', 'true'); + } catch (error) { + if (error && error.status === 403) { + core.warning('PR comment was not posted because this repository grants GitHub Actions a read-only GITHUB_TOKEN.'); + } else { + throw error; + } } - name: Report comment permission limitation - if: ${{ steps.comment.outcome == 'failure' }} + if: ${{ steps.comment.outputs.posted != 'true' }} run: | { echo From e493e130f742745e5d1124a1c47ab2b7489dc62b Mon Sep 17 00:00:00 2001 From: robertvava Date: Mon, 9 Mar 2026 11:55:52 +0000 Subject: [PATCH 9/9] addressing devin's comments | safeguards for minimum number of features --- src/hyperview/embeddings/pipelines.py | 7 ++-- src/hyperview/embeddings/projection.py | 10 ++++-- tests/test_pca_projection.py | 44 ++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/src/hyperview/embeddings/pipelines.py b/src/hyperview/embeddings/pipelines.py index a46e52a..2211469 100644 --- a/src/hyperview/embeddings/pipelines.py +++ b/src/hyperview/embeddings/pipelines.py @@ -154,8 +154,11 @@ def compute_layout( if len(ids) == 0: raise ValueError(f"No embeddings in space '{space_key}'. Call compute_embeddings() first.") - if len(ids) < 3: - raise ValueError(f"Need at least 3 samples for visualization, have {len(ids)}") + min_samples = 3 if method == "umap" else 2 + if len(ids) < min_samples: + raise ValueError( + f"Need at least {min_samples} samples for {method} visualization, have {len(ids)}" + ) if method == "umap": layout_params = { diff --git a/src/hyperview/embeddings/projection.py b/src/hyperview/embeddings/projection.py index 68892e5..39ab108 100644 --- a/src/hyperview/embeddings/projection.py +++ b/src/hyperview/embeddings/projection.py @@ -330,12 +330,18 @@ def _pca_svd( # Economy SVD (full_matrices=False) is O(N * D * min(N,D)) _U, S, Vt = np.linalg.svd(centered, full_matrices=False) - projected = centered @ Vt[:n_components].T + # SVD produces min(N, D) components; clamp to what's available + k = min(n_components, Vt.shape[0]) + projected = centered @ Vt[:k].T + if k < n_components: + pad = np.zeros((projected.shape[0], n_components - k), dtype=projected.dtype) + projected = np.hstack([projected, pad]) variance = S**2 / max(len(data) - 1, 1) total_var = variance.sum() if total_var > 0: - explained = variance[:n_components] / total_var + explained = np.zeros(n_components) + explained[:k] = variance[:k] / total_var else: explained = np.zeros(n_components) diff --git a/tests/test_pca_projection.py b/tests/test_pca_projection.py index 3581810..dfc33f4 100644 --- a/tests/test_pca_projection.py +++ b/tests/test_pca_projection.py @@ -180,6 +180,17 @@ def test_two_samples_minimal(self): coords = engine.project(data, method="pca", input_geometry="euclidean", output_geometry="euclidean") assert coords.shape == (2, 2) + def test_pca_svd_fewer_features_than_components(self): + """When D < n_components, _pca_svd should pad with zeros.""" + engine = ProjectionEngine() + data = np.array([[1.0], [2.0], [3.0]]) # (3, 1) — only 1 feature + projected, explained = engine._pca_svd(data, n_components=2) + assert projected.shape == (3, 2) + assert explained.shape == (2,) + # Second component should be zero-padded + np.testing.assert_array_equal(projected[:, 1], 0.0) + assert explained[1] == 0.0 + class TestIntegrationDatasetApi: @@ -282,3 +293,36 @@ def test_compute_visualization_pca_hyperboloid(self): assert len(vis_ids) == 20 assert vis_coords.shape == (20, 2) assert np.all(np.isfinite(vis_coords)) + + def test_two_samples_through_pipeline(self): + """2-sample PCA should work end-to-end through compute_visualization.""" + from hyperview.core.dataset import Dataset + from hyperview.core.sample import Sample + + ds = Dataset("test_pca_2samp", persist=False) + + for i in range(2): + ds.add_sample(Sample(id=f"s{i}", filepath=f"/fake/{i}.png", label=f"c{i}")) + + space_key = "test_space" + ds._storage.ensure_space( + model_id="test", + dim=16, + config={"provider": "test", "geometry": "euclidean"}, + space_key=space_key, + ) + rng = np.random.default_rng(42) + ids = [f"s{i}" for i in range(2)] + vectors = rng.standard_normal((2, 16)).astype(np.float32) + ds._storage.add_embeddings(space_key, ids, vectors) + + layout_key = ds.compute_visualization( + space_key=space_key, + method="pca", + geometry="euclidean", + ) + + assert layout_key is not None + vis_ids, _, vis_coords = ds.get_visualization_data(layout_key) + assert len(vis_ids) == 2 + assert vis_coords.shape == (2, 2)