diff --git a/README.md b/README.md index 04d869a..3bf3d69 100644 --- a/README.md +++ b/README.md @@ -28,3 +28,4 @@ Run tests: ```bash pytest tests/ -v ``` +test123 diff --git a/data/ratings.csv b/data/ratings.csv deleted file mode 100644 index ef17bca..0000000 --- a/data/ratings.csv +++ /dev/null @@ -1 +0,0 @@ -user_id,restaurant_id,rating diff --git a/data/restaurants.csv b/data/restaurants.csv deleted file mode 100644 index b230675..0000000 --- a/data/restaurants.csv +++ /dev/null @@ -1 +0,0 @@ -restaurant_id,name,cuisine,price,spice,distance_km,veg_friendly diff --git a/data/users.csv b/data/users.csv deleted file mode 100644 index f44dd20..0000000 --- a/data/users.csv +++ /dev/null @@ -1 +0,0 @@ -user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance diff --git a/src/data_loader.py b/src/data_loader.py index 1c4b306..f290029 100644 --- a/src/data_loader.py +++ b/src/data_loader.py @@ -1,2 +1,148 @@ """Data loading utilities for TasteVector.""" +""" +data_loader.py — Data Ingestion & Validation +============================================ +Person 1 | TasteVector Project + +Single point of contact with raw CSV files. +Reads restaurants.csv, users.csv, ratings.csv into clean Pandas DataFrames. +All other modules receive DataFrames or NumPy arrays — never raw CSV paths. +""" + +import os +import pandas as pd + + +# ── Expected schema ────────────────────────────────────────────────────────── + +RESTAURANT_COLS = {"restaurant_id", "name", "cuisine", "price", + "spice", "distance_km", "veg_friendly"} + +USER_COLS = {"user_id", "name", "preferred_cuisine", + "max_price", "spice_tolerance", "max_distance"} + +RATING_COLS = {"user_id", "restaurant_id", "rating"} + + +# ── Loaders ────────────────────────────────────────────────────────────────── + +def load_restaurants(data_dir: str) -> pd.DataFrame: + """ + Load and validate restaurants.csv. 
def load_users(data_dir: str) -> pd.DataFrame:
    """
    Read ``users.csv`` from *data_dir* and return a cleaned DataFrame.

    Expected columns: user_id, name, preferred_cuisine, max_price,
    spice_tolerance, max_distance.

    The four numeric columns are coerced with ``pd.to_numeric``; values
    that cannot be parsed become NaN.  Any row with a NaN in a required
    column is then dropped (a warning is printed via ``_warn_dropped``)
    and the index is renumbered from 0.  Value ranges are not checked here.

    Raises
    ------
    ValueError (from ``_check_columns``) if a required column is absent.
    """
    filename = "users.csv"
    frame = pd.read_csv(os.path.join(data_dir, filename))

    _check_columns(frame, USER_COLS, filename)

    # Unparseable cells turn into NaN so the dropna below removes the row.
    for column in ("user_id", "max_price", "spice_tolerance", "max_distance"):
        frame[column] = pd.to_numeric(frame[column], errors="coerce")

    rows_read = len(frame)
    cleaned = frame.dropna(subset=list(USER_COLS))
    _warn_dropped(rows_read, len(cleaned), filename)

    return cleaned.reset_index(drop=True)
def _check_columns(df: pd.DataFrame, required: set, filename: str) -> None:
    """
    Ensure every column named in *required* exists in *df*.

    Returns None when nothing is missing; otherwise raises ValueError
    whose message names *filename* and the set of missing columns.
    """
    present = set(df.columns)
    missing = required - present
    if not missing:
        return
    raise ValueError(f"{filename} is missing required columns: {missing}")
def covariance_matrix(F: np.ndarray) -> np.ndarray:
    """
    Return the feature "covariance" matrix C = F^T F.

    Note that F is used as given: no column means are subtracted and no
    1/n scaling is applied, so C is the uncentered Gram (scatter) matrix
    of the feature columns.

    F : (n_restaurants, n_features)
    C : (n_features, n_features) — symmetric
    """
    return np.matmul(F.T, F)
def verify_diagonalization(C: np.ndarray, P, D, P_inv,
                           tol: float = 1e-8) -> bool:
    """
    Report whether the product P D P^{-1} reproduces C.

    The three factors are multiplied left to right and compared with C
    element-wise using ``np.allclose`` with absolute tolerance *tol*
    (NumPy's default relative tolerance also applies).

    Returns
    -------
    True if the reconstruction matches C, False otherwise.
    """
    rebuilt = np.matmul(np.matmul(P, D), P_inv)
    matches = np.allclose(rebuilt, C, atol=tol)
    return bool(matches)
def project_onto_top_k(v: np.ndarray, E: np.ndarray) -> np.ndarray:
    """
    Map a preference vector v into the column space of E.

        v_proj = E @ (E^T @ v)

    This equals the orthogonal projection onto span(E) when the columns
    of E are orthonormal; the function does not check that.

    Parameters
    ----------
    v : (n_features,)   — user preference vector
    E : (n_features, k) — basis vectors, e.g. from top_k_eigenvectors()

    Returns
    -------
    v_proj : (n_features,)
    """
    # Coordinates of v along each column of E, computed first so only
    # small (k-length) intermediates are formed.
    coordinates = np.matmul(E.T, v)
    return np.matmul(E, coordinates)
def lu_decompose(A: np.ndarray) -> tuple:
    """
    Factor a square matrix A so that P @ A = L @ U, using Gaussian
    elimination with partial pivoting.

    Partial pivoting: at each column the current row is swapped with the
    row holding the largest absolute value in that column (at or below
    the diagonal).  This improves numerical stability and avoids dividing
    by a zero pivot when a usable one exists further down.

    Parameters
    ----------
    A : array-like, shape (n, n) — must be square and non-singular.
        The input is copied; the caller's array is never modified.

    Returns
    -------
    L : (n, n) lower triangular, diagonal entries = 1
    U : (n, n) upper triangular (entries below the diagonal are exactly 0)
    P : (n, n) permutation matrix recording the row swaps

    Raises
    ------
    ValueError               if A is not a 2-D square matrix
    np.linalg.LinAlgError    if a pivot is (numerically) zero, i.e.
                             |pivot| < 1e-12 — A is treated as singular
    """
    A = np.array(A, dtype=float)

    # Validate the shape BEFORE indexing it: a 0-d input has no shape[0]
    # and would otherwise fail with IndexError instead of ValueError.
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError(f"lu_decompose requires a square matrix, got {A.shape}")

    n = A.shape[0]
    L = np.eye(n)       # multipliers are filled in column by column
    U = A.copy()        # reduced to upper-triangular form in place
    P = np.eye(n)       # accumulates the row swaps

    for col in range(n):
        # ── Partial pivoting ──────────────────────────────────────────────
        max_row = col + int(np.argmax(np.abs(U[col:, col])))
        if max_row != col:
            U[[col, max_row]] = U[[max_row, col]]
            P[[col, max_row]] = P[[max_row, col]]
            # Only the multipliers already stored (columns < col) move
            # with their rows; for col == 0 this slice is empty.
            L[[col, max_row], :col] = L[[max_row, col], :col]

        pivot = U[col, col]
        if abs(pivot) < 1e-12:
            raise np.linalg.LinAlgError(
                f"Zero pivot encountered at column {col} — matrix is singular."
            )

        # ── Elimination: clear every entry below the pivot at once ────────
        factors = U[col + 1:, col] / pivot
        L[col + 1:, col] = factors
        U[col + 1:, col + 1:] -= factors[:, np.newaxis] * U[col, col + 1:]
        # Set the eliminated entries to exactly zero rather than leaving
        # floating-point residue from factor * pivot.
        U[col + 1:, col] = 0.0

    return L, U, P
def solve(A: np.ndarray, b: np.ndarray) -> np.ndarray:
    """
    Solve A x = b by LU decomposition with partial pivoting.

    Pipeline
    --------
    1. lu_decompose(A)            → L, U, P with P A = L U
    2. permute the right-hand side: P b
    3. forward_substitution       → y with L y = P b
    4. back_substitution          → x with U x = y

    Parameters
    ----------
    A : (n, n) square non-singular matrix
    b : (n,)   right-hand side (converted to a float array)

    Returns
    -------
    x : (n,) solution vector such that A @ x ≈ b
    """
    rhs = np.array(b, dtype=float)
    lower, upper, perm = lu_decompose(A)
    # The rows of A were reordered during pivoting, so b must be
    # reordered the same way before substitution.
    intermediate = forward_substitution(lower, perm @ rhs)
    return back_substitution(upper, intermediate)
def verify_lu(A: np.ndarray, L: np.ndarray,
              U: np.ndarray, P: np.ndarray,
              tol: float = 1e-8) -> bool:
    """
    Return True when P @ A and L @ U agree element-wise.

    Comparison uses ``np.allclose`` with absolute tolerance *tol*
    (NumPy's default relative tolerance also applies).
    """
    permuted = np.matmul(P, A)
    product = np.matmul(L, U)
    return bool(np.allclose(permuted, product, atol=tol))
3.], [6., 3.]]), + np.array([10., 12.]), + "2x2 simple" + ), + ] + + for A, b, label in test_systems: + L, U, P = lu_decompose(A) + x = solve(A, b) + lu_ok = verify_lu(A, L, U, P) + res = residual(A, x, b) + x_ref = np.linalg.solve(A, b) + print(f"\n[{label}]") + print(f" x (ours) : {np.round(x, 6)}") + print(f" x (np.linalg): {np.round(x_ref, 6)}") + print(f" LU verified : {lu_ok}") + print(f" Residual ||Ax-b||: {res:.2e}") + + print("\n" + "=" * 55) + print("PREFERENCE WEIGHT SOLVER") + print("=" * 55) + + restaurants, users, ratings = load_all(os.path.join(base, "data")) + F = build_feature_matrix(restaurants) + R = build_rating_matrix(users, restaurants, ratings) + + # User 0 (Alice) — use her rated restaurants + user_idx = 0 + rated_mask = R[user_idx] != 0 + F_rated = F[rated_mask] + ratings_vec = R[user_idx, rated_mask] + + weights = solve_preference_weights(F_rated, ratings_vec) + print(f"\nAlice's feature weights: {dict(zip(['price','spice','dist','veg'], np.round(weights,4)))}") + + scores = score_restaurants(F, weights) + top5 = np.argsort(scores)[::-1][:5] + print("\nTop-5 restaurants by weight-based score:") + for idx in top5: + name = restaurants.iloc[idx]["name"] + print(f" {name:<25} score={scores[idx]:.3f}") diff --git a/src/matrix_builder.py b/src/matrix_builder.py index 8e0f2ee..ee933fd 100644 --- a/src/matrix_builder.py +++ b/src/matrix_builder.py @@ -1,2 +1,223 @@ """Matrix construction utilities for TasteVector.""" +""" +matrix_builder.py — Rating & Feature Matrix Construction +========================================================= +Person 1 | TasteVector Project + +Builds the two core NumPy matrices used throughout the pipeline: + + R : (n_users x n_restaurants) user-restaurant rating matrix + R[i][j] = rating user i gave restaurant j + 0 means "not rated" (sparse) + + F : (n_restaurants x n_features) restaurant feature matrix + each row = [price, spice, distance_km, veg_friendly] + +Also implements: + - Matrix operations (addition, 
def build_rating_matrix(users: pd.DataFrame,
                        restaurants: pd.DataFrame,
                        ratings: pd.DataFrame) -> np.ndarray:
    """
    Construct the user-restaurant rating matrix R.

    R[i][j] = rating that user i gave restaurant j
    R[i][j] = 0 if user i has not rated restaurant j (sparse / missing)

    Row i corresponds to the i-th row of *users* and column j to the
    j-th row of *restaurants*.  Ratings that refer to an unknown user or
    restaurant id are ignored.  If the same (user, restaurant) pair
    occurs more than once, the last occurrence wins.

    Parameters
    ----------
    users       : users DataFrame (must have 'user_id' column)
    restaurants : restaurants DataFrame (must have 'restaurant_id' column)
    ratings     : ratings DataFrame (user_id, restaurant_id, rating)

    Returns
    -------
    R : np.ndarray, shape (n_users, n_restaurants), dtype float64
    """
    # id → row/column position
    row_of = {uid: pos for pos, uid in enumerate(users["user_id"])}
    col_of = {rid: pos for pos, rid in enumerate(restaurants["restaurant_id"])}

    R = np.zeros((len(users), len(restaurants)), dtype=float)

    # Iterate the three columns directly instead of DataFrame.iterrows():
    # iterrows builds one Series per row, which upcasts integer ids to
    # float64 whenever 'rating' is float (distinct ids above 2**53 would
    # collide) and is far slower.
    triples = zip(ratings["user_id"], ratings["restaurant_id"], ratings["rating"])
    for uid, rid, value in triples:
        i = row_of.get(uid)
        j = col_of.get(rid)
        if i is None or j is None:
            continue
        R[i, j] = float(value)

    return R
def mat_inverse(A: np.ndarray) -> np.ndarray:
    """
    Inverse of a square matrix A.

    Raises
    ------
    ValueError             if A has fewer than two dimensions or its
                           first two dimensions differ
    np.linalg.LinAlgError  if A is singular
    """
    # Check the rank first: a 0-d or 1-D array has no shape[1], which
    # previously surfaced as an IndexError rather than the ValueError
    # callers are told to expect.
    if A.ndim < 2 or A.shape[0] != A.shape[1]:
        raise ValueError(f"Matrix must be square to invert, got {A.shape}")
    return np.linalg.inv(A)
def sparsity_report(R: np.ndarray) -> dict:
    """
    Return a dict with basic sparsity stats about R.

    A cell counts as "rated" when it is non-zero.

    Keys
    ----
    shape, total_cells, rated, unrated : counts taken directly from R
    sparsity_pct : percentage of zero cells, rounded to 2 decimals
    density_pct  : percentage of non-zero cells, rounded to 2 decimals

    For an empty matrix (zero cells) both percentages are reported as
    0.0 instead of raising ZeroDivisionError.
    """
    total = R.size
    rated = int(np.count_nonzero(R))
    unrated = total - rated

    if total:
        sparsity_pct = round(100.0 * unrated / total, 2)
        density_pct = round(100.0 * rated / total, 2)
    else:
        # No cells at all: the ratios are undefined, report zeros.
        sparsity_pct = density_pct = 0.0

    return {
        "shape": R.shape,
        "total_cells": total,
        "rated": rated,
        "unrated": unrated,
        "sparsity_pct": sparsity_pct,
        "density_pct": density_pct,
    }
def cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
    """
    Cosine of the angle between u and v:

        sim(u, v) = (u · v) / (||u|| * ||v||)

    If either vector has zero Euclidean norm the similarity is defined
    here as 0.0, which avoids a division by zero.
    """
    lengths = (np.linalg.norm(u), np.linalg.norm(v))
    if any(length == 0.0 for length in lengths):
        return 0.0
    return float(np.dot(u, v) / (lengths[0] * lengths[1]))
def pagerank_scores(F: np.ndarray) -> np.ndarray:
    """
    Full pipeline: build graph → row-normalize → stationary distribution.

    The row-normalized graph is a row-stochastic transition matrix T.
    PageRank scores are its stationary distribution π, which satisfies
    π T = π, i.e. π is a LEFT eigenvector of T (equivalently a right
    eigenvector of T^T).  ``dominant_eigenvector`` relies on
    ``np.linalg.eig``, which returns right eigenvectors, so the
    transposed matrix is passed to it.  Decomposing T itself would
    return the all-ones right eigenvector and give every restaurant the
    identical score 1/n.  This also matches ``power_iteration``, which
    multiplies by M^T.

    Parameters
    ----------
    F : (n_restaurants, n_features)

    Returns
    -------
    scores : (n_restaurants,) PageRank importance score per restaurant,
             normalized to sum to 1.
    """
    G = build_similarity_graph(F)
    transition = normalize_graph(G)
    return dominant_eigenvector(transition.T)
+ + Returns + ------- + List of (restaurant_idx, score) tuples + """ + order = np.argsort(scores)[::-1] + return [(int(i), float(scores[i])) for i in order] + + +# ── Smoke test ─────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os, sys + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.join(base, "src")) + from data_loader import load_all + + restaurants, _, _ = load_all(os.path.join(base, "data")) + F = restaurants[["price", "spice", "distance_km", "veg_friendly"]].to_numpy(dtype=float) + + G = build_similarity_graph(F) + print(f"Similarity graph shape: {G.shape}") + print(f"Mean edge weight: {G[G > 0].mean():.4f}") + + scores_eig = pagerank_scores(F) + G_norm = normalize_graph(G) + scores_pow = power_iteration(G_norm) + + print(f"\nCorrelation (eig vs power iter): " + f"{np.corrcoef(scores_eig, scores_pow)[0,1]:.6f}") + + print("\nPageRank Restaurant Ranking:") + print(f"{'Rank':<6}{'Name':<25}{'Score':>8}") + print("-" * 40) + for rank, (idx, score) in enumerate(rank_restaurants(scores_eig), 1): + name = restaurants.iloc[idx]["name"] + print(f"{rank:<6}{name:<25}{score:>8.5f}") diff --git a/src/recommender.py b/src/recommender.py index 4333e86..6eb4516 100644 --- a/src/recommender.py +++ b/src/recommender.py @@ -1,2 +1,302 @@ """Main TasteVector recommendation pipeline.""" +""" +recommender.py — Main Recommendation Pipeline +============================================= +Person 3 | TasteVector Project + +Orchestrates the full recommendation pipeline: + + Existing user → SVD collaborative filtering + PageRank blending + New user → Least Squares projection (cold-start) + PageRank blending + +The final score is a weighted combination: + final_score = α * svd_score + β * pagerank_score + γ * projection_score + +Filters are applied post-scoring to remove restaurants that exceed the +user's max_price or max_distance constraints. 
"""
recommender.py — Main Recommendation Pipeline
=============================================
Person 3 | TasteVector Project

Orchestrates the two recommendation paths:

    Existing user → SVD collaborative filtering blended with PageRank
    New user      → projection of stated preferences (cold-start)
                    blended with PageRank

Scores are combined with the module-level weights ALPHA / BETA / GAMMA.
After blending, restaurants that exceed the user's max_price or
max_distance are forced to -inf so they never reach the top-N list.
"""

import os
import sys

import numpy as np
import pandas as pd

# Allow running this file directly from the src/ directory
_src = os.path.dirname(os.path.abspath(__file__))
if _src not in sys.path:
    sys.path.insert(0, _src)

from svd_recommender import predict_ratings, top_n_for_user, mean_center, decompose, recommended_k
from eigen_decomp import covariance_matrix, top_k_eigenvectors, project_onto_top_k
from pagerank_ranker import pagerank_scores, build_similarity_graph, normalize_graph


# ── Weights for score blending ───────────────────────────────────────────────
ALPHA = 0.60   # SVD collaborative filtering weight
BETA = 0.25    # PageRank global importance weight
GAMMA = 0.15   # Cold-start projection weight (only for new users)


# ── Feature matrix builder (inline; avoids circular imports) ─────────────────

def _build_feature_matrix(restaurants: pd.DataFrame) -> np.ndarray:
    """Return the numeric restaurant features as a float array.

    Columns, in order: price, spice, distance_km, veg_friendly.
    """
    cols = ["price", "spice", "distance_km", "veg_friendly"]
    return restaurants[cols].to_numpy(dtype=float)


# ── Constraint filtering ─────────────────────────────────────────────────────

def apply_constraints(restaurants: pd.DataFrame, scores: np.ndarray,
                      max_price: int | None = None,
                      max_distance: float | None = None) -> np.ndarray:
    """
    Set scores to -inf for restaurants that violate hard constraints.

    Parameters
    ----------
    restaurants  : DataFrame with "price" and "distance_km" columns,
                   row-aligned with ``scores``
    scores       : (n_restaurants,) score per restaurant (not modified)
    max_price    : drop restaurants whose price is greater than this
    max_distance : drop restaurants whose distance_km is greater than this

    Returns
    -------
    A float copy of ``scores`` with violating entries replaced by -inf.
    """
    # Always work on a float copy: -inf cannot be stored in an integer array.
    filtered = np.array(scores, dtype=float)
    if max_price is not None:
        filtered[restaurants["price"].to_numpy() > max_price] = -np.inf
    if max_distance is not None:
        filtered[restaurants["distance_km"].to_numpy() > max_distance] = -np.inf
    return filtered


# ── Existing-user recommendation ─────────────────────────────────────────────

def recommend_for_existing_user(
    user_idx: int,
    R: np.ndarray,
    restaurants: pd.DataFrame,
    n: int = 5,
    k: int = None,
    max_price: int = None,
    max_distance: float = None,
) -> list:
    """
    Recommendation pipeline for a user with rating history.

    Parameters
    ----------
    user_idx     : row index in R
    R            : (n_users, n_restaurants) raw rating matrix, 0 = unrated
    restaurants  : DataFrame with restaurant metadata
    n            : number of results to return
    k            : SVD latent dimensions (auto-selected via
                   ``recommended_k`` if None)
    max_price    : hard constraint on price
    max_distance : hard constraint on distance (km)

    Returns
    -------
    List of dicts, each with restaurant metadata + scores
    (see ``_format_results``). May hold fewer than ``n`` entries when
    constraints or already-rated items remove candidates.
    """
    F = _build_feature_matrix(restaurants)

    # ── SVD score ──────────────────────────────────────────────────────────
    # The singular values are only needed to pick k automatically; skip the
    # extra decomposition when the caller already chose k.
    if k is None:
        R_centered, _ = mean_center(R)
        _, sigma, _ = decompose(R_centered)
        k = recommended_k(sigma)
    R_pred = predict_ratings(R, k=k)
    svd_scores = R_pred[user_idx]

    # Normalize SVD scores to [0, 1]
    svd_min, svd_max = svd_scores.min(), svd_scores.max()
    if svd_max > svd_min:
        svd_norm = (svd_scores - svd_min) / (svd_max - svd_min)
    else:
        svd_norm = np.zeros_like(svd_scores)

    # ── PageRank score ─────────────────────────────────────────────────────
    pr_scores = pagerank_scores(F)

    # ── Blend ──────────────────────────────────────────────────────────────
    # NOTE(review): pr_scores is documented as summing to 1 across all
    # restaurants while svd_norm spans [0, 1], so the BETA term is small
    # relative to the ALPHA term when there are many restaurants — confirm
    # this scale difference is intended.
    final = ALPHA * svd_norm + BETA * pr_scores

    # ── Apply constraints ──────────────────────────────────────────────────
    final = apply_constraints(restaurants, final, max_price, max_distance)

    # ── Exclude already-rated restaurants ──────────────────────────────────
    final[R[user_idx] != 0] = -np.inf

    # ── Collect top-N ──────────────────────────────────────────────────────
    top_idx = np.argsort(final)[::-1][:n]
    return _format_results(top_idx, final, svd_norm, pr_scores, restaurants)


# ── New-user recommendation (cold-start) ─────────────────────────────────────

def recommend_for_new_user(
    preferences: np.ndarray,
    R: np.ndarray,
    restaurants: pd.DataFrame,
    n: int = 5,
    max_price: int = None,
    max_distance: float = None,
) -> list:
    """
    Cold-start recommendation: user has no rating history.

    Strategy
    --------
    1. Project the user preference vector with ``project_onto_top_k`` onto
       the top-k eigenvectors of the restaurant feature covariance matrix.
    2. Compute cosine similarity between the projected vector and each
       restaurant's feature row.
    3. Blend with the PageRank score.

    Parameters
    ----------
    preferences  : (n_features,) user's stated preferences
                   [price, spice, distance_km, veg_friendly]
    R            : rating matrix; accepted for signature symmetry with
                   ``recommend_for_existing_user`` but not read here
    restaurants  : DataFrame with restaurant metadata
    n            : number of results to return
    max_price    : hard constraint on price
    max_distance : hard constraint on distance (km)

    Returns
    -------
    List of dicts, each with restaurant metadata + scores
    (see ``_format_results``).
    """
    F = _build_feature_matrix(restaurants)

    # ── Eigendecomposition of feature covariance ───────────────────────────
    C = covariance_matrix(F)
    k = min(3, C.shape[0])
    E = top_k_eigenvectors(C, k=k)

    # ── Project user preference vector ─────────────────────────────────────
    # presumably returns a vector in the original feature space so it can be
    # compared with rows of F — verify against eigen_decomp.
    v_proj = project_onto_top_k(preferences, E)

    # ── Cosine similarity of projected vector vs each restaurant ───────────
    proj_scores = np.array([_cosine(v_proj, row) for row in F])

    # ── PageRank score ─────────────────────────────────────────────────────
    pr_scores = pagerank_scores(F)

    # ── Blend ──────────────────────────────────────────────────────────────
    # A new user has no SVD score, so the projection score carries the
    # combined ALPHA + GAMMA weight and the weights still sum to 1.
    final = (ALPHA + GAMMA) * proj_scores + BETA * pr_scores

    # ── Apply constraints ──────────────────────────────────────────────────
    final = apply_constraints(restaurants, final, max_price, max_distance)

    top_idx = np.argsort(final)[::-1][:n]
    return _format_results(top_idx, final, proj_scores, pr_scores, restaurants)


def _cosine(u: np.ndarray, v: np.ndarray) -> float:
    """Cosine similarity of two vectors; 0.0 if either has zero norm."""
    nu, nv = np.linalg.norm(u), np.linalg.norm(v)
    if nu == 0 or nv == 0:
        return 0.0
    return float(np.dot(u, v) / (nu * nv))


# ── Result formatter ─────────────────────────────────────────────────────────

def _format_results(top_idx, final_scores, content_scores,
                    pr_scores, restaurants) -> list:
    """
    Turn ranked restaurant indices into a list of result dicts.

    Entries whose final score is -inf (filtered out by constraints or
    already rated) are skipped. ``rank`` is the 1-based position within
    ``top_idx``.
    """
    results = []
    for rank, idx in enumerate(top_idx, 1):
        if final_scores[idx] == -np.inf:
            continue
        row = restaurants.iloc[idx]
        results.append({
            "rank": rank,
            "restaurant_id": int(row["restaurant_id"]),
            "name": row["name"],
            "cuisine": row["cuisine"],
            "price": int(row["price"]),
            "spice": int(row["spice"]),
            "distance_km": float(row["distance_km"]),
            "veg_friendly": bool(row["veg_friendly"]),
            "final_score": round(float(final_scores[idx]), 4),
            "content_score": round(float(content_scores[idx]), 4),
            "pagerank_score": round(float(pr_scores[idx]), 4),
        })
    return results


# ── Public entry point (called by api.py) ────────────────────────────────────

def get_recommendations(
    user_id: int | None,
    preferences: dict,
    R: np.ndarray,
    users: pd.DataFrame,
    restaurants: pd.DataFrame,
    top_n: int = 5,
) -> list:
    """
    Main entry point: route to the existing-user or cold-start pipeline.

    Parameters
    ----------
    user_id     : existing user ID, or None for a new/anonymous user.
                  An ID that is not present in ``users`` falls back to the
                  cold-start path.
    preferences : dict with optional keys max_price, spice_tolerance,
                  max_distance; a missing key or a None value means
                  "not specified"
    R           : rating matrix; the position of ``user_id`` in
                  ``users["user_id"]`` is used as the row index into R
    users       : users DataFrame
    restaurants : restaurants DataFrame
    top_n       : number of recommendations

    Returns
    -------
    List of recommendation dicts (see _format_results)
    """
    preferences = preferences or {}
    max_price = preferences.get("max_price")
    max_distance = preferences.get("max_distance")
    spice = preferences.get("spice_tolerance")
    if spice is None:
        # An explicit None would otherwise turn into NaN in the vector below.
        spice = 3

    if user_id is not None:
        # Map user_id → row index (first match) with a single scan.
        matches = np.flatnonzero(users["user_id"].to_numpy() == user_id)
        if matches.size:
            return recommend_for_existing_user(
                int(matches[0]), R, restaurants, n=top_n,
                max_price=max_price, max_distance=max_distance,
            )

    # New / anonymous user — use preference vector. Defaults apply only when
    # a value is absent, matching the `is not None` test in apply_constraints.
    price_pref = 3 if max_price is None else max_price
    dist_pref = 3.0 if max_distance is None else max_distance
    pref_vec = np.array([price_pref, spice, dist_pref, 1.0], dtype=float)

    return recommend_for_new_user(
        pref_vec, R, restaurants, n=top_n,
        max_price=max_price, max_distance=max_distance,
    )


# ── Smoke test ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, os.path.join(base, "src"))
    from data_loader import load_all
    from matrix_builder import build_rating_matrix

    restaurants, users, ratings = load_all(os.path.join(base, "data"))
    R = build_rating_matrix(users, restaurants, ratings)

    print("=" * 55)
    print("EXISTING USER — Alice (user_id=1)")
    print("=" * 55)
    recs = get_recommendations(
        user_id=1,
        preferences={"max_price": 4, "max_distance": 4.0, "spice_tolerance": 5},
        R=R, users=users, restaurants=restaurants, top_n=5,
    )
    for r in recs:
        print(f" #{r['rank']} {r['name']:<25} "
              f"score={r['final_score']:.4f} "
              f"(svd={r['content_score']:.4f}, pr={r['pagerank_score']:.4f})")

    print("\n" + "=" * 55)
    print("NEW USER — no history, prefers cheap + spicy + nearby")
    print("=" * 55)
    recs_new = get_recommendations(
        user_id=None,
        preferences={"max_price": 2, "max_distance": 2.0, "spice_tolerance": 5},
        R=R, users=users, restaurants=restaurants, top_n=5,
    )
    for r in recs_new:
        print(f" #{r['rank']} {r['name']:<25} "
              f"score={r['final_score']:.4f} "
              f"(proj={r['content_score']:.4f}, pr={r['pagerank_score']:.4f})")
"""
svd_recommender.py — SVD-Based Collaborative Filtering
=======================================================
Person 3 | TasteVector Project

Factorizes the mean-centered rating matrix with a singular value
decomposition:
    R_centered = U @ diag(sigma) @ V^T

Keeping only the top-k singular values (truncated SVD) reduces noise and
yields a predicted score for every user-restaurant pair, including pairs
the user has not rated.
"""

import numpy as np


# ── Mean-centering ──────────────────────────────────────────────────────────

def mean_center(R: np.ndarray) -> tuple:
    """
    Subtract each user's mean rating (rated entries only) from their row.

    Parameters
    ----------
    R : (n_users, n_restaurants)   0 = unrated

    Returns
    -------
    R_centered : float matrix; rated cells hold rating - user mean,
                 unrated cells stay 0
    user_means : per-user mean over rated cells, shape (n_users,);
                 0 for users with no ratings
    """
    R = np.asarray(R)
    ratings = R.astype(float)               # astype already returns a copy
    rated = R != 0
    counts = rated.sum(axis=1)
    totals = np.where(rated, ratings, 0.0).sum(axis=1)
    # Users without any rating keep a mean of 0 instead of dividing by zero.
    user_means = np.divide(totals, counts,
                           out=np.zeros(R.shape[0]), where=counts > 0)
    R_centered = np.where(rated, ratings - user_means[:, np.newaxis], ratings)
    return R_centered, user_means


def restore_means(R_approx: np.ndarray, user_means: np.ndarray) -> np.ndarray:
    """Add per-user means back to every cell of an approximated matrix."""
    return R_approx + user_means[:, np.newaxis]


# ── SVD decomposition ───────────────────────────────────────────────────────

def decompose(R_centered: np.ndarray) -> tuple:
    """
    Full SVD: R_centered = U @ diag(sigma) @ Vt

    Returns
    -------
    U     : (n_users, n_users)
    sigma : singular values in descending order, shape (min(m,n),)
    Vt    : (n_restaurants, n_restaurants)
    """
    U, sigma, Vt = np.linalg.svd(R_centered, full_matrices=True)
    return U, sigma, Vt


def truncate(U, sigma, Vt, k: int) -> tuple:
    """
    Keep only the top-k singular values/vectors (noise reduction).

    ``k`` is clamped to the range [0, len(sigma)], so a negative value
    yields empty factors rather than slicing from the end.

    Returns
    -------
    U_k  : (n_users, k)
    s_k  : (k,)
    Vt_k : (k, n_restaurants)
    """
    k = max(0, min(k, len(sigma)))
    return U[:, :k], sigma[:k], Vt[:k, :]


def reconstruct(U_k, s_k, Vt_k) -> np.ndarray:
    """R_approx = U_k @ diag(s_k) @ Vt_k"""
    # Scaling the columns of U_k is the same product without materializing
    # the dense diagonal matrix.
    return (U_k * s_k) @ Vt_k


# ── Variance explained (helps choose k) ─────────────────────────────────────

def variance_explained(sigma: np.ndarray) -> np.ndarray:
    """
    Cumulative fraction of variance explained by each singular value.

    Returns all zeros when every singular value is zero (nothing to
    explain) instead of dividing by zero.
    """
    sq = np.asarray(sigma, dtype=float) ** 2
    total = sq.sum()
    if total == 0:
        return np.zeros_like(sq)
    return np.cumsum(sq) / total


def recommended_k(sigma: np.ndarray, threshold: float = 0.80) -> int:
    """
    Smallest k that explains at least `threshold` of total variance.

    Falls back to ``len(sigma)`` when no prefix reaches the threshold.
    """
    cumvar = variance_explained(sigma)
    hits = np.flatnonzero(cumvar >= threshold)
    if hits.size:
        return int(hits[0]) + 1
    return len(sigma)


# ── High-level prediction pipeline ──────────────────────────────────────────

def predict_ratings(R: np.ndarray, k: int = 4) -> np.ndarray:
    """
    Full pipeline: mean-center → SVD → truncate → reconstruct → restore means.

    Parameters
    ----------
    R : (n_users, n_restaurants) rating matrix, 0 = unrated
    k : number of singular values to keep

    Returns
    -------
    R_pred : (n_users, n_restaurants) predicted score for every cell
    """
    R_centered, user_means = mean_center(R)
    U, sigma, Vt = decompose(R_centered)
    U_k, s_k, Vt_k = truncate(U, sigma, Vt, k)
    R_approx = reconstruct(U_k, s_k, Vt_k)
    return restore_means(R_approx, user_means)


def top_n_for_user(user_idx: int, R: np.ndarray, R_pred: np.ndarray,
                   n: int = 5, only_unrated: bool = True) -> list:
    """
    Top-N restaurant recommendations for a single user.

    Parameters
    ----------
    user_idx    : row index of the target user
    R           : original rating matrix (to identify already-rated items)
    R_pred      : predicted rating matrix
    n           : maximum number of results
    only_unrated: exclude restaurants the user has already rated

    Returns
    -------
    List of (restaurant_idx, predicted_score) sorted descending by score.
    With ``only_unrated`` the list holds fewer than ``n`` entries when the
    user has fewer than ``n`` unrated restaurants.
    """
    # Float copy so -inf can be assigned even if R_pred has an integer dtype.
    scores = np.array(R_pred[user_idx], dtype=float)
    excluded = np.zeros(scores.shape, dtype=bool)
    if only_unrated:
        excluded = np.asarray(R[user_idx]) != 0
        scores[excluded] = -np.inf
    top_idx = np.argsort(scores)[::-1][:n]
    # Excluded items sort last; drop any that still fit inside the first n.
    return [(int(i), float(scores[i])) for i in top_idx if not excluded[i]]


# ── Evaluation ───────────────────────────────────────────────────────────────

def frobenius_error(R_original: np.ndarray, R_approx: np.ndarray) -> float:
    """
    Frobenius norm of residuals on rated entries only.

    Cells where ``R_original`` is 0 (unrated) are ignored.
    """
    mask = R_original != 0
    diff = (R_original - R_approx)[mask]
    return float(np.sqrt((diff ** 2).sum()))


# ── Smoke test ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import os
    import sys
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, os.path.join(base, "src"))
    from data_loader import load_all
    from matrix_builder import build_rating_matrix

    restaurants, users, ratings = load_all(os.path.join(base, "data"))
    R = build_rating_matrix(users, restaurants, ratings)

    print(f"Rating matrix: {R.shape} | "
          f"Sparsity: {100*np.count_nonzero(R)/R.size:.1f}% rated")

    _, sigma, _ = decompose(mean_center(R)[0])
    print("\nSingular values:", np.round(sigma[:8], 3))

    k = recommended_k(sigma)
    print(f"Recommended k (80% variance): {k}")
    for i, v in enumerate(variance_explained(sigma)[:8]):
        print(f" k={i+1}: {v*100:.1f}% variance")

    R_pred = predict_ratings(R, k=k)
    print("\nTop-5 recs for user 0 (Alice):")
    for rest_idx, score in top_n_for_user(0, R, R_pred, n=5):
        name = restaurants.iloc[rest_idx]["name"]
        print(f" [{rest_idx+1}] {name:<25} predicted={score:.3f}")

    print(f"\nFrobenius error (rated entries): {frobenius_error(R, R_pred):.4f}")