From 92a1e85048d4f14df76e466016a51fbe6688ca3a Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Sun, 12 Apr 2026 18:35:41 +0530 Subject: [PATCH 01/25] Implement data loading and validation for CSV files This module provides data loading utilities for the TasteVector project, including functions to load and validate CSV files for restaurants, users, and ratings. It ensures that the data adheres to expected schemas and performs necessary type coercions. --- src/data_loader.py | 146 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/src/data_loader.py b/src/data_loader.py index 1c4b306..f290029 100644 --- a/src/data_loader.py +++ b/src/data_loader.py @@ -1,2 +1,148 @@ """Data loading utilities for TasteVector.""" +""" +data_loader.py — Data Ingestion & Validation +============================================ +Person 1 | TasteVector Project + +Single point of contact with raw CSV files. +Reads restaurants.csv, users.csv, ratings.csv into clean Pandas DataFrames. +All other modules receive DataFrames or NumPy arrays — never raw CSV paths. +""" + +import os +import pandas as pd + + +# ── Expected schema ────────────────────────────────────────────────────────── + +RESTAURANT_COLS = {"restaurant_id", "name", "cuisine", "price", + "spice", "distance_km", "veg_friendly"} + +USER_COLS = {"user_id", "name", "preferred_cuisine", + "max_price", "spice_tolerance", "max_distance"} + +RATING_COLS = {"user_id", "restaurant_id", "rating"} + + +# ── Loaders ────────────────────────────────────────────────────────────────── + +def load_restaurants(data_dir: str) -> pd.DataFrame: + """ + Load and validate restaurants.csv. 
+ + Columns: restaurant_id, name, cuisine, price (1-5), spice (1-5), + distance_km (float), veg_friendly (0 or 1) + """ + path = os.path.join(data_dir, "restaurants.csv") + df = pd.read_csv(path) + + _check_columns(df, RESTAURANT_COLS, "restaurants.csv") + + # Type coercions + df["restaurant_id"] = pd.to_numeric(df["restaurant_id"], errors="coerce") + df["price"] = pd.to_numeric(df["price"], errors="coerce") + df["spice"] = pd.to_numeric(df["spice"], errors="coerce") + df["distance_km"] = pd.to_numeric(df["distance_km"], errors="coerce") + df["veg_friendly"] = pd.to_numeric(df["veg_friendly"], errors="coerce") + + before = len(df) + df = df.dropna(subset=list(RESTAURANT_COLS)) + _warn_dropped(before, len(df), "restaurants.csv") + + df = df.reset_index(drop=True) + return df + + +def load_users(data_dir: str) -> pd.DataFrame: + """ + Load and validate users.csv. + + Columns: user_id, name, preferred_cuisine, max_price (1-5), + spice_tolerance (1-5), max_distance (float) + """ + path = os.path.join(data_dir, "users.csv") + df = pd.read_csv(path) + + _check_columns(df, USER_COLS, "users.csv") + + df["user_id"] = pd.to_numeric(df["user_id"], errors="coerce") + df["max_price"] = pd.to_numeric(df["max_price"], errors="coerce") + df["spice_tolerance"] = pd.to_numeric(df["spice_tolerance"], errors="coerce") + df["max_distance"] = pd.to_numeric(df["max_distance"], errors="coerce") + + before = len(df) + df = df.dropna(subset=list(USER_COLS)) + _warn_dropped(before, len(df), "users.csv") + + df = df.reset_index(drop=True) + return df + + +def load_ratings(data_dir: str) -> pd.DataFrame: + """ + Load and validate ratings.csv. 
+ + Columns: user_id, restaurant_id, rating (1.0 - 5.0) + """ + path = os.path.join(data_dir, "ratings.csv") + df = pd.read_csv(path) + + _check_columns(df, RATING_COLS, "ratings.csv") + + df["user_id"] = pd.to_numeric(df["user_id"], errors="coerce") + df["restaurant_id"] = pd.to_numeric(df["restaurant_id"], errors="coerce") + df["rating"] = pd.to_numeric(df["rating"], errors="coerce") + + before = len(df) + df = df.dropna(subset=list(RATING_COLS)) + # Clamp ratings to valid range + df = df[(df["rating"] >= 1.0) & (df["rating"] <= 5.0)] + _warn_dropped(before, len(df), "ratings.csv") + + df = df.reset_index(drop=True) + return df + + +def load_all(data_dir: str) -> tuple: + """ + Load all three CSVs at once. + + Returns + ------- + (restaurants, users, ratings) as clean DataFrames + """ + restaurants = load_restaurants(data_dir) + users = load_users(data_dir) + ratings = load_ratings(data_dir) + return restaurants, users, ratings + + +# ── Helpers ────────────────────────────────────────────────────────────────── + +def _check_columns(df: pd.DataFrame, required: set, filename: str) -> None: + missing = required - set(df.columns) + if missing: + raise ValueError(f"{filename} is missing required columns: {missing}") + + +def _warn_dropped(before: int, after: int, filename: str) -> None: + dropped = before - after + if dropped > 0: + print(f"[data_loader] WARNING: dropped {dropped} malformed " + f"row(s) from {filename}") + + +# ── Smoke test ─────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os, sys + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + restaurants, users, ratings = load_all(os.path.join(base, "data")) + + print(f"Restaurants : {len(restaurants)} rows") + print(restaurants.head(3).to_string(index=False)) + print(f"\nUsers : {len(users)} rows") + print(users.head(3).to_string(index=False)) + print(f"\nRatings : {len(ratings)} rows") + print(ratings.head(5).to_string(index=False)) From 
3736face248a7982a5f3099a40e3cd2e90c59ae4 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Sun, 12 Apr 2026 18:36:17 +0530 Subject: [PATCH 02/25] Add matrix construction utilities for TasteVector --- src/matrix_builder.py | 221 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/src/matrix_builder.py b/src/matrix_builder.py index 8e0f2ee..ee933fd 100644 --- a/src/matrix_builder.py +++ b/src/matrix_builder.py @@ -1,2 +1,223 @@ """Matrix construction utilities for TasteVector.""" +""" +matrix_builder.py — Rating & Feature Matrix Construction +========================================================= +Person 1 | TasteVector Project + +Builds the two core NumPy matrices used throughout the pipeline: + + R : (n_users x n_restaurants) user-restaurant rating matrix + R[i][j] = rating user i gave restaurant j + 0 means "not rated" (sparse) + + F : (n_restaurants x n_features) restaurant feature matrix + each row = [price, spice, distance_km, veg_friendly] + +Also implements: + - Matrix operations (addition, multiplication, transpose, inversion) + - Mean-centering of the rating matrix + - LU Decomposition via Gaussian Elimination ← see gaussian_elimination.py +""" + +import numpy as np +import pandas as pd + + +# ── Feature columns used to build F ───────────────────────────────────────── +FEATURE_COLS = ["price", "spice", "distance_km", "veg_friendly"] + + +# ── Rating matrix ──────────────────────────────────────────────────────────── + +def build_rating_matrix(users: pd.DataFrame, + restaurants: pd.DataFrame, + ratings: pd.DataFrame) -> np.ndarray: + """ + Construct the user-restaurant rating matrix R. 
+ + R[i][j] = rating that user i gave restaurant j + R[i][j] = 0 if user i has not rated restaurant j (sparse / missing) + + Parameters + ---------- + users : users DataFrame (must have 'user_id' column) + restaurants : restaurants DataFrame (must have 'restaurant_id' column) + ratings : ratings DataFrame (user_id, restaurant_id, rating) + + Returns + ------- + R : np.ndarray, shape (n_users, n_restaurants), dtype float64 + """ + n_users = len(users) + n_rest = len(restaurants) + + # Build index maps: id → row/col position + uid_to_idx = {uid: i for i, uid in enumerate(users["user_id"])} + rid_to_idx = {rid: j for j, rid in enumerate(restaurants["restaurant_id"])} + + R = np.zeros((n_users, n_rest), dtype=float) + + for _, row in ratings.iterrows(): + i = uid_to_idx.get(row["user_id"]) + j = rid_to_idx.get(row["restaurant_id"]) + if i is not None and j is not None: + R[i, j] = float(row["rating"]) + + return R + + +# ── Feature matrix ──────────────────────────────────────────────────────────── + +def build_feature_matrix(restaurants: pd.DataFrame) -> np.ndarray: + """ + Construct the restaurant feature matrix F. + + Each row represents one restaurant as a numeric vector: + [price, spice, distance_km, veg_friendly] + + Parameters + ---------- + restaurants : restaurants DataFrame + + Returns + ------- + F : np.ndarray, shape (n_restaurants, n_features), dtype float64 + """ + return restaurants[FEATURE_COLS].to_numpy(dtype=float) + + +# ── Matrix operations ──────────────────────────────────────────────────────── + +def mat_add(A: np.ndarray, B: np.ndarray) -> np.ndarray: + """Element-wise matrix addition. A and B must have the same shape.""" + if A.shape != B.shape: + raise ValueError(f"Shape mismatch: {A.shape} vs {B.shape}") + return A + B + + +def mat_multiply(A: np.ndarray, B: np.ndarray) -> np.ndarray: + """ + Matrix multiplication C = A @ B. 
+ A : (m, k) B : (k, n) → C : (m, n) + """ + if A.shape[1] != B.shape[0]: + raise ValueError( + f"Incompatible shapes for multiplication: {A.shape} @ {B.shape}" + ) + return A @ B + + +def mat_transpose(A: np.ndarray) -> np.ndarray: + """Transpose of A.""" + return A.T + + +def mat_inverse(A: np.ndarray) -> np.ndarray: + """ + Inverse of a square matrix A. + Raises np.linalg.LinAlgError if A is singular. + """ + if A.shape[0] != A.shape[1]: + raise ValueError(f"Matrix must be square to invert, got {A.shape}") + return np.linalg.inv(A) + + +# ── Mean-centering ──────────────────────────────────────────────────────────── + +def mean_center(R: np.ndarray) -> tuple: + """ + Subtract each user's mean rating from their rated entries. + + For user i: + mean_i = average of all non-zero entries in R[i] + R_c[i, j] = R[i, j] - mean_i for every j where R[i, j] != 0 + R_c[i, j] = 0 for unrated entries (unchanged) + + This corrects for users who consistently rate high or low ("bias"). + + Returns + ------- + R_centered : np.ndarray, same shape as R + user_means : np.ndarray, shape (n_users,) + """ + R_centered = R.astype(float).copy() + user_means = np.zeros(R.shape[0]) + + for i in range(R.shape[0]): + rated_mask = R[i] != 0 + if rated_mask.any(): + mu = R[i, rated_mask].mean() + user_means[i] = mu + R_centered[i, rated_mask] -= mu + + return R_centered, user_means + + +def restore_means(R_approx: np.ndarray, user_means: np.ndarray) -> np.ndarray: + """Add per-user means back (undo mean-centering).""" + return R_approx + user_means[:, np.newaxis] + + +# ── Sparsity analysis ───────────────────────────────────────────────────────── + +def sparsity_report(R: np.ndarray) -> dict: + """ + Return a dict with basic sparsity stats about R. + Used in subspace_analysis.py and tests. 
+ """ + total = R.size + rated = int(np.count_nonzero(R)) + unrated = total - rated + return { + "shape": R.shape, + "total_cells": total, + "rated": rated, + "unrated": unrated, + "sparsity_pct": round(100.0 * unrated / total, 2), + "density_pct": round(100.0 * rated / total, 2), + } + + +# ── Smoke test ──────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os, sys + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.join(base, "src")) + from data_loader import load_all + + restaurants, users, ratings = load_all(os.path.join(base, "data")) + + R = build_rating_matrix(users, restaurants, ratings) + F = build_feature_matrix(restaurants) + + print(f"Rating matrix R : {R.shape}") + print(f"Feature matrix F : {F.shape}") + + stats = sparsity_report(R) + print(f"\nSparsity report:") + for k, v in stats.items(): + print(f" {k:<18}: {v}") + + R_c, means = mean_center(R) + print(f"\nUser means (per-row average of rated entries):") + for i, mu in enumerate(means): + name = users.iloc[i]["name"] + print(f" {name:<10}: {mu:.3f}") + + # Verify mean-centering: mean of centered rated entries should be ~0 + for i in range(R.shape[0]): + mask = R[i] != 0 + if mask.any(): + assert abs(R_c[i, mask].mean()) < 1e-10, f"User {i} not centered!" 
+ print("\nMean-centering verified: all per-user means ≈ 0 ✓") + + # Basic matrix ops + print("\nMatrix operation checks:") + A = np.array([[1.0, 2.0], [3.0, 4.0]]) + B = np.array([[5.0, 6.0], [7.0, 8.0]]) + print(f" A + B =\n{mat_add(A, B)}") + print(f" A @ B =\n{mat_multiply(A, B)}") + print(f" A^T =\n{mat_transpose(A)}") + print(f" A^-1 =\n{np.round(mat_inverse(A), 4)}") From d1250103b92fedeea01f175a8410d4edd2f4b43c Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Sun, 12 Apr 2026 18:36:50 +0530 Subject: [PATCH 03/25] Update gaussian_elimination.py --- src/gaussian_elimination.py | 276 ++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) diff --git a/src/gaussian_elimination.py b/src/gaussian_elimination.py index 4806f84..ff92c28 100644 --- a/src/gaussian_elimination.py +++ b/src/gaussian_elimination.py @@ -1,2 +1,278 @@ """Gaussian elimination and LU decomposition routines.""" +""" +gaussian_elimination.py — LU Decomposition & Linear System Solver +================================================================== +Person 1 | TasteVector Project + +Implements LU Decomposition from scratch using only NumPy arrays. +Does NOT call scipy.linalg.lu — every step is hand-coded. + +Factorization: A = L @ U + L : lower triangular matrix (with 1s on diagonal) + U : upper triangular matrix + +Solving Ax = b in two passes: + 1. Forward substitution : solve L y = b for y + 2. Back substitution : solve U x = y for x + +Application in TasteVector +-------------------------- +Given a user's sparse preference vector b (ratings for a few restaurants) +and the feature sub-matrix A for those rated restaurants: + + A x = b → solve for x = feature weight vector + +x tells us how much the user cares about price, spice, distance, etc. +That weight vector is then dotted with every unrated restaurant's +feature vector to produce a predicted score. 
+""" + +import numpy as np + + +# ── LU Decomposition ───────────────────────────────────────────────────────── + +def lu_decompose(A: np.ndarray) -> tuple: + """ + Factor a square matrix A into A = L @ U using Gaussian Elimination + with partial pivoting. + + Partial pivoting: at each step, swap the current row with the row + that has the largest absolute value in the pivot column. This improves + numerical stability and avoids division by zero. + + Parameters + ---------- + A : np.ndarray, shape (n, n) — must be square and non-singular + + Returns + ------- + L : (n, n) lower triangular, diagonal entries = 1 + U : (n, n) upper triangular + P : (n, n) permutation matrix (tracks row swaps) + satisfies P @ A = L @ U + + Raises + ------ + ValueError if A is not square + np.linalg.LinAlgError if A is singular (pivot becomes 0) + """ + A = np.array(A, dtype=float) + n = A.shape[0] + + if A.ndim != 2 or A.shape[0] != A.shape[1]: + raise ValueError(f"lu_decompose requires a square matrix, got {A.shape}") + + L = np.eye(n) # start as identity; filled column by column + U = A.copy() # will be reduced to upper triangular in place + P = np.eye(n) # permutation matrix + + for col in range(n): + # ── Partial pivoting ────────────────────────────────────────────── + max_row = col + np.argmax(np.abs(U[col:, col])) + if max_row != col: + U[[col, max_row]] = U[[max_row, col]] + P[[col, max_row]] = P[[max_row, col]] + if col > 0: + L[[col, max_row], :col] = L[[max_row, col], :col] + + pivot = U[col, col] + if abs(pivot) < 1e-12: + raise np.linalg.LinAlgError( + f"Zero pivot encountered at column {col} — matrix is singular." 
+ ) + + # ── Elimination: zero out entries below the pivot ───────────────── + for row in range(col + 1, n): + factor = U[row, col] / pivot + L[row, col] = factor + U[row, col:] -= factor * U[col, col:] + + return L, U, P + + +# ── Forward & back substitution ────────────────────────────────────────────── + +def forward_substitution(L: np.ndarray, b: np.ndarray) -> np.ndarray: + """ + Solve L y = b for y where L is lower triangular. + + y_i = (b_i - sum_{j<i} L[i,j] * y[j]) / L[i,i] + """ + n = len(b) + y = np.zeros(n) + for i in range(n): + y[i] = (b[i] - np.dot(L[i, :i], y[:i])) / L[i, i] + return y + + +def back_substitution(U: np.ndarray, y: np.ndarray) -> np.ndarray: + """ + Solve U x = y for x where U is upper triangular. + + x_i = (y_i - sum_{j>i} U[i,j] * x[j]) / U[i,i] + """ + n = len(y) + x = np.zeros(n) + for i in range(n - 1, -1, -1): + x[i] = (y[i] - np.dot(U[i, i + 1:], x[i + 1:])) / U[i, i] + return x + + +# ── Combined solver ────────────────────────────────────────────────────────── + +def solve(A: np.ndarray, b: np.ndarray) -> np.ndarray: + """ + Solve the linear system A x = b using LU Decomposition. + + Steps + ----- + 1. Decompose: P A = L U + 2. Apply permutation: b' = P b + 3. Forward sub: L y = b' → y + 4. Back sub: U x = y → x + + Parameters + ---------- + A : (n, n) square non-singular matrix + b : (n,) right-hand side vector + + Returns + ------- + x : (n,) solution vector such that A @ x ≈ b + """ + b = np.array(b, dtype=float) + L, U, P = lu_decompose(A) + b_perm = P @ b # apply the same row permutations to b + y = forward_substitution(L, b_perm) + x = back_substitution(U, y) + return x + + +# ── Application: user preference weights ───────────────────────────────────── + +def solve_preference_weights(F_rated: np.ndarray, + ratings_vec: np.ndarray) -> np.ndarray: + """ + Given a user's ratings for a subset of restaurants and the feature + vectors of those restaurants, solve for the user's feature weight vector. + + System: F_rated @ x = ratings_vec + (n_rated x n_features) @ (n_features,) = (n_rated,) + + When n_rated != n_features this is over/under-determined. 
+ We solve the Normal Equations instead: + F^T F x = F^T b (Least Squares) + + Parameters + ---------- + F_rated : (n_rated, n_features) feature matrix for rated restaurants + ratings_vec : (n_rated,) user's actual ratings + + Returns + ------- + x : (n_features,) weight vector [w_price, w_spice, w_distance, w_veg] + """ + A = F_rated.T @ F_rated # (n_features, n_features) — always square + b = F_rated.T @ ratings_vec # (n_features,) + return solve(A, b) + + +def score_restaurants(F: np.ndarray, weights: np.ndarray) -> np.ndarray: + """ + Dot every restaurant's feature vector with the user's weight vector. + + score_j = F[j] · weights + + Returns + ------- + scores : (n_restaurants,) predicted preference score per restaurant + """ + return F @ weights + + +# ── Verification helpers (used in tests) ───────────────────────────────────── + +def verify_lu(A: np.ndarray, L: np.ndarray, + U: np.ndarray, P: np.ndarray, + tol: float = 1e-8) -> bool: + """Check that P @ A ≈ L @ U within floating-point tolerance.""" + return bool(np.allclose(P @ A, L @ U, atol=tol)) + + +def residual(A: np.ndarray, x: np.ndarray, b: np.ndarray) -> float: + """||A x - b|| — should be near zero for a correct solution.""" + return float(np.linalg.norm(A @ x - b)) + + +# ── Smoke test ──────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os, sys + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.join(base, "src")) + from data_loader import load_all + from matrix_builder import build_feature_matrix, build_rating_matrix + + print("=" * 55) + print("LU DECOMPOSITION TESTS") + print("=" * 55) + + test_systems = [ + # (A, b, label) + ( + np.array([[2., 1., -1.], [-3., -1., 2.], [-2., 1., 2.]]), + np.array([8., -11., -3.]), + "3x3 classic" + ), + ( + np.array([[1., 2., 0., 0.], [3., 4., 2., 0.], + [0., 1., 3., 1.], [0., 0., 2., 4.]]), + np.array([1., 2., 3., 4.]), + "4x4 banded" + ), + ( + np.array([[4., 
3.], [6., 3.]]), + np.array([10., 12.]), + "2x2 simple" + ), + ] + + for A, b, label in test_systems: + L, U, P = lu_decompose(A) + x = solve(A, b) + lu_ok = verify_lu(A, L, U, P) + res = residual(A, x, b) + x_ref = np.linalg.solve(A, b) + print(f"\n[{label}]") + print(f" x (ours) : {np.round(x, 6)}") + print(f" x (np.linalg): {np.round(x_ref, 6)}") + print(f" LU verified : {lu_ok}") + print(f" Residual ||Ax-b||: {res:.2e}") + + print("\n" + "=" * 55) + print("PREFERENCE WEIGHT SOLVER") + print("=" * 55) + + restaurants, users, ratings = load_all(os.path.join(base, "data")) + F = build_feature_matrix(restaurants) + R = build_rating_matrix(users, restaurants, ratings) + + # User 0 (Alice) — use her rated restaurants + user_idx = 0 + rated_mask = R[user_idx] != 0 + F_rated = F[rated_mask] + ratings_vec = R[user_idx, rated_mask] + + weights = solve_preference_weights(F_rated, ratings_vec) + print(f"\nAlice's feature weights: {dict(zip(['price','spice','dist','veg'], np.round(weights,4)))}") + + scores = score_restaurants(F, weights) + top5 = np.argsort(scores)[::-1][:5] + print("\nTop-5 restaurants by weight-based score:") + for idx in top5: + name = restaurants.iloc[idx]["name"] + print(f" {name:<25} score={scores[idx]:.3f}") From d16160b3231b9262bbe80e1e2d5c24bf55334b7f Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Sun, 12 Apr 2026 18:37:28 +0530 Subject: [PATCH 04/25] Update svd_recommender.py --- src/svd_recommender.py | 177 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/src/svd_recommender.py b/src/svd_recommender.py index bb6e8d6..b1c2234 100644 --- a/src/svd_recommender.py +++ b/src/svd_recommender.py @@ -1,2 +1,179 @@ """SVD-based collaborative filtering routines.""" +""" +svd_recommender.py — SVD-Based Collaborative Filtering +======================================================= +Person 3 | TasteVector Project + +Decomposes the mean-centered rating matrix R using SVD: + R = U * Sigma * V^T + +Truncated SVD 
(top-k singular values) reduces noise and fills in +predicted scores for unrated user-restaurant pairs. +""" + +import numpy as np + + +# ── Mean-centering ────────────────────────────────────────────────────────── + +def mean_center(R: np.ndarray) -> tuple: + """ + Subtract each user's mean rating (rated entries only) from their row. + + Parameters + ---------- + R : (n_users, n_restaurants) 0 = unrated + + Returns + ------- + R_centered : mean-centered matrix + user_means : per-user mean, shape (n_users,) + """ + R_centered = R.astype(float).copy() + user_means = np.zeros(R.shape[0]) + for i in range(R.shape[0]): + mask = R[i] != 0 + if mask.any(): + user_means[i] = R[i, mask].mean() + R_centered[i, mask] -= user_means[i] + return R_centered, user_means + + +def restore_means(R_approx: np.ndarray, user_means: np.ndarray) -> np.ndarray: + """Add per-user means back to an approximated matrix.""" + return R_approx + user_means[:, np.newaxis] + + +# ── SVD decomposition ─────────────────────────────────────────────────────── + +def decompose(R_centered: np.ndarray) -> tuple: + """ + Full SVD: R_centered = U @ diag(sigma) @ Vt + + Returns + ------- + U : (n_users, n_users) + sigma : singular values in descending order, shape (min(m,n),) + Vt : (n_restaurants, n_restaurants) + """ + U, sigma, Vt = np.linalg.svd(R_centered, full_matrices=True) + return U, sigma, Vt + + +def truncate(U, sigma, Vt, k: int) -> tuple: + """ + Keep only the top-k singular values/vectors (noise reduction). 
+ + Returns + ------- + U_k : (n_users, k) + s_k : (k,) + Vt_k : (k, n_restaurants) + """ + k = min(k, len(sigma)) + return U[:, :k], sigma[:k], Vt[:k, :] + + +def reconstruct(U_k, s_k, Vt_k) -> np.ndarray: + """R_approx = U_k @ diag(s_k) @ Vt_k""" + return U_k @ np.diag(s_k) @ Vt_k + + +# ── Variance explained (helps choose k) ───────────────────────────────────── + +def variance_explained(sigma: np.ndarray) -> np.ndarray: + """Cumulative fraction of variance explained by each singular value.""" + sq = sigma ** 2 + return np.cumsum(sq) / sq.sum() + + +def recommended_k(sigma: np.ndarray, threshold: float = 0.80) -> int: + """Smallest k that explains at least `threshold` of total variance.""" + cumvar = variance_explained(sigma) + hits = np.where(cumvar >= threshold)[0] + return int(hits[0]) + 1 if len(hits) > 0 else len(sigma) + + +# ── High-level prediction pipeline ────────────────────────────────────────── + +def predict_ratings(R: np.ndarray, k: int = 4) -> np.ndarray: + """ + Full pipeline: mean-center → SVD → truncate → reconstruct → restore means. + + Returns + ------- + R_pred : (n_users, n_restaurants) predicted score for every cell + """ + R_centered, user_means = mean_center(R) + U, sigma, Vt = decompose(R_centered) + U_k, s_k, Vt_k = truncate(U, sigma, Vt, k) + R_approx = reconstruct(U_k, s_k, Vt_k) + return restore_means(R_approx, user_means) + + +def top_n_for_user(user_idx: int, R: np.ndarray, R_pred: np.ndarray, + n: int = 5, only_unrated: bool = True) -> list: + """ + Top-N restaurant recommendations for a single user. 
+ + Parameters + ---------- + user_idx : row index of the target user + R : original rating matrix (to identify already-rated items) + R_pred : predicted rating matrix + only_unrated: exclude restaurants the user has already rated + + Returns + ------- + List of (restaurant_idx, predicted_score) sorted descending by score + """ + scores = R_pred[user_idx].copy() + if only_unrated: + scores[R[user_idx] != 0] = -np.inf + top_idx = np.argsort(scores)[::-1][:n] + return [(int(i), float(scores[i])) for i in top_idx] + + +# ── Evaluation ─────────────────────────────────────────────────────────────── + +def frobenius_error(R_original: np.ndarray, R_approx: np.ndarray) -> float: + """ + Frobenius norm of residuals on rated entries only. + Used in unit tests to verify SVD beats a random baseline. + """ + mask = R_original != 0 + diff = (R_original - R_approx)[mask] + return float(np.sqrt((diff ** 2).sum())) + + +# ── Smoke test ─────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os, sys + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.join(base, "src")) + from data_loader import load_all + from matrix_builder import build_rating_matrix + + restaurants, users, ratings = load_all(os.path.join(base, "data")) + R = build_rating_matrix(users, restaurants, ratings) + + print(f"Rating matrix: {R.shape} | " + f"Sparsity: {100*np.count_nonzero(R)/R.size:.1f}% rated") + + _, sigma, _ = decompose(mean_center(R)[0]) + print("\nSingular values:", np.round(sigma[:8], 3)) + + k = recommended_k(sigma) + print(f"Recommended k (80% variance): {k}") + for i, v in enumerate(variance_explained(sigma)[:8]): + print(f" k={i+1}: {v*100:.1f}% variance") + + R_pred = predict_ratings(R, k=k) + print("\nTop-5 recs for user 0 (Alice):") + for rest_idx, score in top_n_for_user(0, R, R_pred, n=5): + name = restaurants.iloc[rest_idx]["name"] + print(f" [{rest_idx+1}] {name:<25} predicted={score:.3f}") + + 
print(f"\nFrobenius error (rated entries): {frobenius_error(R, R_pred):.4f}") From 620c683bd1e0ab4a35faeaa07c11f02c5cc8c309 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Sun, 12 Apr 2026 18:38:18 +0530 Subject: [PATCH 05/25] Add eigendecomposition and diagonalization utilities --- src/eigen_decomp.py | 215 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/src/eigen_decomp.py b/src/eigen_decomp.py index 788421a..66e78b0 100644 --- a/src/eigen_decomp.py +++ b/src/eigen_decomp.py @@ -1,2 +1,217 @@ """Eigendecomposition and diagonalization utilities.""" +""" +eigen_decomp.py — Eigendecomposition & Diagonalization +======================================================= +Person 3 | TasteVector Project + +Computes the covariance matrix C = F^T F of the restaurant feature matrix, +then finds eigenvalues and eigenvectors: + + C = P D P^{-1} (diagonalization) + +where D = diag(eigenvalues) and P = matrix of eigenvectors (columns). + +The top eigenvectors capture the feature directions along which restaurants +vary most (analogous to PCA principal components). These are used to +project user preference vectors into the most informative subspace. + +Also validates the Cayley-Hamilton theorem as a correctness check: + substituting eigenvalue λ into the characteristic polynomial p(λ) → 0 +""" + +import numpy as np + + +# ── Covariance matrix ──────────────────────────────────────────────────────── + +def covariance_matrix(F: np.ndarray) -> np.ndarray: + """ + Compute the feature covariance matrix C = F^T @ F. + + F : (n_restaurants, n_features) + C : (n_features, n_features) — symmetric, positive semi-definite + """ + return F.T @ F + + +# ── Eigendecomposition ─────────────────────────────────────────────────────── + +def eigen_decompose(C: np.ndarray) -> tuple: + """ + Compute eigenvalues and eigenvectors of a square matrix C. 
+ + C v = λ v + + Returns + ------- + eigenvalues : (n,) — may be complex for non-symmetric matrices + (C = F^T F is always symmetric → always real) + eigenvectors : (n, n) — columns are eigenvectors + """ + eigenvalues, eigenvectors = np.linalg.eig(C) + + # sort descending by magnitude so the most important directions come first + order = np.argsort(np.abs(eigenvalues))[::-1] + return eigenvalues[order], eigenvectors[:, order] + + +# ── Diagonalization C = P D P^{-1} ───────────────────────────────────────── + +def diagonalize(C: np.ndarray) -> tuple: + """ + Diagonalize C = P D P^{-1}. + + Returns + ------- + P : (n, n) matrix whose columns are eigenvectors + D : (n, n) diagonal matrix of eigenvalues + P_inv: (n, n) inverse of P + """ + eigenvalues, P = eigen_decompose(C) + D = np.diag(eigenvalues) + P_inv = np.linalg.inv(P) + return P, D, P_inv + + +def verify_diagonalization(C: np.ndarray, P, D, P_inv, + tol: float = 1e-8) -> bool: + """ + Check P @ D @ P^{-1} ≈ C within floating-point tolerance. + Returns True if the reconstruction is accurate. + """ + C_reconstructed = P @ D @ P_inv + return bool(np.allclose(C_reconstructed, C, atol=tol)) + + +# ── Cayley-Hamilton theorem ────────────────────────────────────────────────── + +def cayley_hamilton_check(C: np.ndarray, tol: float = 1e-6) -> dict: + """ + Cayley-Hamilton: every matrix satisfies its own characteristic polynomial. + + The characteristic polynomial of C is det(C - λI) = 0. + We compute the coefficients via numpy and evaluate p(C) — if the theorem + holds, the result should be the zero matrix (within floating-point error). + + For each eigenvalue λ_i we also verify p(λ_i) ≈ 0. 
+ + Returns + ------- + dict with keys: + 'matrix_check' : bool — p(C) ≈ 0 (Frobenius norm < tol * n^2) + 'eigenvalue_residuals' : array of |p(λ_i)| for each eigenvalue + 'all_pass' : bool + """ + n = C.shape[0] + coeffs = np.poly(C) # characteristic polynomial coefficients + + # Evaluate p(C) using Horner's method + pC = np.zeros_like(C, dtype=complex) + for c in coeffs: + pC = pC @ C + c * np.eye(n) + + matrix_frobenius = np.linalg.norm(pC) + matrix_ok = bool(matrix_frobenius < tol * n * n) + + # Evaluate p(λ) for each eigenvalue + eigenvalues, _ = np.linalg.eig(C) + residuals = np.array([abs(np.polyval(coeffs, lam)) for lam in eigenvalues]) + eigenvalue_ok = bool(np.all(residuals < tol)) + + return { + "matrix_check": matrix_ok, + "matrix_frobenius_norm": float(matrix_frobenius), + "eigenvalue_residuals": residuals, + "all_pass": matrix_ok and eigenvalue_ok, + } + + +# ── Top eigenvectors (PCA-style projection) ────────────────────────────────── + +def top_k_eigenvectors(C: np.ndarray, k: int) -> np.ndarray: + """ + Return the k eigenvectors corresponding to the k largest eigenvalues. + These span the directions of maximum variance in the feature space. + + Returns + ------- + E : (n_features, k) — columns are the top-k eigenvectors + """ + eigenvalues, eigenvectors = eigen_decompose(C) + return eigenvectors[:, :k] + + +def project_onto_top_k(v: np.ndarray, E: np.ndarray) -> np.ndarray: + """ + Project a preference vector v onto the subspace spanned by E. 
+ + v_proj = E @ E^T @ v + + Parameters + ---------- + v : (n_features,) — user preference vector + E : (n_features, k) — top-k eigenvectors from top_k_eigenvectors() + + Returns + ------- + v_proj : (n_features,) — projection of v onto the top-k subspace + """ + return E @ (E.T @ v) + + +# ── Analysis report ────────────────────────────────────────────────────────── + +def eigen_report(F: np.ndarray, feature_names: list = None) -> None: + """ + Print a human-readable eigendecomposition report for the feature matrix F. + """ + C = covariance_matrix(F) + eigenvalues, eigenvectors = eigen_decompose(C) + P, D, P_inv = diagonalize(C) + + if feature_names is None: + feature_names = [f"feature_{i}" for i in range(F.shape[1])] + + print("=" * 55) + print("EIGENDECOMPOSITION REPORT") + print("=" * 55) + print(f"\nCovariance matrix C = F^T F shape: {C.shape}") + + total = np.abs(eigenvalues).sum() + print("\nEigenvalues (sorted descending):") + for i, (lam, vec) in enumerate(zip(eigenvalues, eigenvectors.T)): + pct = 100 * abs(lam) / total + top_feat = feature_names[np.argmax(np.abs(vec))] + print(f" λ_{i+1} = {lam.real:8.3f} ({pct:5.1f}%) " + f"dominant feature: {top_feat}") + + diag_ok = verify_diagonalization(C, P, D, P_inv) + print(f"\nDiagonalization C = P D P^{{-1}} verified: {diag_ok}") + + ch = cayley_hamilton_check(C) + print(f"Cayley-Hamilton check passed: {ch['all_pass']}") + print(f" p(C) Frobenius norm: {ch['matrix_frobenius_norm']:.2e}") + + +# ── Smoke test ─────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os, sys + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.join(base, "src")) + from data_loader import load_all + + restaurants, _, _ = load_all(os.path.join(base, "data")) + F = restaurants[["price", "spice", "distance_km", "veg_friendly"]].to_numpy(dtype=float) + feature_names = ["price", "spice", "distance_km", "veg_friendly"] + + eigen_report(F, 
feature_names) + + C = covariance_matrix(F) + E = top_k_eigenvectors(C, k=2) + print(f"\nTop-2 eigenvectors shape: {E.shape}") + v = np.array([3.0, 4.0, 2.0, 1.0]) # example user preference vector + v_proj = project_onto_top_k(v, E) + print(f"User pref vector: {v}") + print(f"Projected onto k=2 subspace: {np.round(v_proj, 4)}") From ec198ce2b5275f949c1115569d195b717bb4a5a2 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Sun, 12 Apr 2026 18:38:47 +0530 Subject: [PATCH 06/25] Update pagerank_ranker.py --- src/pagerank_ranker.py | 186 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/src/pagerank_ranker.py b/src/pagerank_ranker.py index b84ed86..8291def 100644 --- a/src/pagerank_ranker.py +++ b/src/pagerank_ranker.py @@ -1,2 +1,188 @@ """PageRank-style restaurant ranking utilities.""" +""" +pagerank_ranker.py — PageRank-Style Restaurant Ranking +====================================================== +Person 3 | TasteVector Project + +Builds a restaurant-to-restaurant similarity graph where: + - Each node = a restaurant + - Edge weight = cosine similarity between the two restaurants' feature vectors + +The graph is represented as an (n_restaurants x n_restaurants) matrix G. + +A PageRank-style score is derived from the dominant eigenvector of G +(the eigenvector corresponding to the largest eigenvalue). + +Restaurants that are highly similar to many other well-regarded restaurants +receive a high PageRank score. This global importance signal is combined +with the SVD collaborative-filtering score in recommender.py. +""" + +import numpy as np + + +# ── Cosine similarity helpers ──────────────────────────────────────────────── + +def cosine_similarity(u: np.ndarray, v: np.ndarray) -> float: + """ + sim(u, v) = (u · v) / (||u|| * ||v||) + Returns 0.0 if either vector is the zero vector. 
+ """ + norm_u = np.linalg.norm(u) + norm_v = np.linalg.norm(v) + if norm_u == 0.0 or norm_v == 0.0: + return 0.0 + return float(np.dot(u, v) / (norm_u * norm_v)) + + +# ── Similarity graph construction ──────────────────────────────────────────── + +def build_similarity_graph(F: np.ndarray) -> np.ndarray: + """ + Build a restaurant-to-restaurant similarity graph G. + + G[i, j] = cosine_similarity(F[i], F[j]) + + Parameters + ---------- + F : (n_restaurants, n_features) + + Returns + ------- + G : (n_restaurants, n_restaurants) symmetric, values in [0, 1] + diagonal entries set to 0 (a restaurant is not similar to itself + for ranking purposes) + """ + n = F.shape[0] + G = np.zeros((n, n)) + for i in range(n): + for j in range(i + 1, n): + sim = cosine_similarity(F[i], F[j]) + G[i, j] = sim + G[j, i] = sim # symmetric + return G + + +def normalize_graph(G: np.ndarray) -> np.ndarray: + """ + Row-normalize G so each row sums to 1 (stochastic matrix). + Rows that are all-zero are left as zero (isolated nodes). + This is analogous to the PageRank transition matrix. + """ + row_sums = G.sum(axis=1, keepdims=True) + row_sums[row_sums == 0] = 1 # avoid division by zero + return G / row_sums + + +# ── PageRank via dominant eigenvector ──────────────────────────────────────── + +def dominant_eigenvector(M: np.ndarray) -> np.ndarray: + """ + Find the dominant eigenvector of matrix M using numpy.linalg.eig. + + The dominant eigenvector corresponds to the eigenvalue with the + largest absolute value. For a non-negative stochastic matrix this + is the Perron-Frobenius eigenvector (all non-negative entries). + + Returns + ------- + v : (n,) real-valued dominant eigenvector, L1-normalized so scores + sum to 1 and are interpretable as probabilities. 
+ """ + eigenvalues, eigenvectors = np.linalg.eig(M) + idx = np.argmax(np.abs(eigenvalues)) + v = eigenvectors[:, idx].real # take real part (tiny imaginary residuals) + v = np.abs(v) # Perron-Frobenius: entries should be ≥ 0 + v /= v.sum() if v.sum() != 0 else 1 # L1-normalize → interpretable scores + return v + + +def pagerank_scores(F: np.ndarray) -> np.ndarray: + """ + Full pipeline: build graph → normalize → dominant eigenvector. + + Parameters + ---------- + F : (n_restaurants, n_features) + + Returns + ------- + scores : (n_restaurants,) PageRank importance score per restaurant, + normalized to sum to 1. + """ + G = build_similarity_graph(F) + G_norm = normalize_graph(G) + return dominant_eigenvector(G_norm) + + +# ── Power iteration (alternative / validation) ─────────────────────────────── + +def power_iteration(M: np.ndarray, n_iter: int = 100, + tol: float = 1e-10) -> np.ndarray: + """ + Compute the dominant eigenvector via power iteration. + More numerically stable for large matrices; useful for validating + the numpy.linalg.eig result. + + Starts from a uniform vector and repeatedly multiplies by M^T, + converging to the stationary distribution of the Markov chain. + """ + n = M.shape[0] + v = np.ones(n) / n # uniform start + for _ in range(n_iter): + v_new = M.T @ v + norm = np.linalg.norm(v_new) + if norm == 0: + break + v_new /= norm + if np.linalg.norm(v_new - v) < tol: + break + v = v_new + v = np.abs(v) + v /= v.sum() if v.sum() != 0 else 1 + return v + + +# ── Ranked restaurant list ──────────────────────────────────────────────────── + +def rank_restaurants(scores: np.ndarray) -> list: + """ + Return restaurant indices sorted by PageRank score (descending). 
+ + Returns + ------- + List of (restaurant_idx, score) tuples + """ + order = np.argsort(scores)[::-1] + return [(int(i), float(scores[i])) for i in order] + + +# ── Smoke test ─────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import os, sys + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.join(base, "src")) + from data_loader import load_all + + restaurants, _, _ = load_all(os.path.join(base, "data")) + F = restaurants[["price", "spice", "distance_km", "veg_friendly"]].to_numpy(dtype=float) + + G = build_similarity_graph(F) + print(f"Similarity graph shape: {G.shape}") + print(f"Mean edge weight: {G[G > 0].mean():.4f}") + + scores_eig = pagerank_scores(F) + G_norm = normalize_graph(G) + scores_pow = power_iteration(G_norm) + + print(f"\nCorrelation (eig vs power iter): " + f"{np.corrcoef(scores_eig, scores_pow)[0,1]:.6f}") + + print("\nPageRank Restaurant Ranking:") + print(f"{'Rank':<6}{'Name':<25}{'Score':>8}") + print("-" * 40) + for rank, (idx, score) in enumerate(rank_restaurants(scores_eig), 1): + name = restaurants.iloc[idx]["name"] + print(f"{rank:<6}{name:<25}{score:>8.5f}") From df82dd68cff1c50ede92ca4b2046cea1dd09aa3c Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Sun, 12 Apr 2026 18:39:09 +0530 Subject: [PATCH 07/25] Update recommender.py --- src/recommender.py | 300 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 300 insertions(+) diff --git a/src/recommender.py b/src/recommender.py index 4333e86..6eb4516 100644 --- a/src/recommender.py +++ b/src/recommender.py @@ -1,2 +1,302 @@ """Main TasteVector recommendation pipeline.""" +""" +recommender.py — Main Recommendation Pipeline +============================================= +Person 3 | TasteVector Project + +Orchestrates the full recommendation pipeline: + + Existing user → SVD collaborative filtering + PageRank blending + New user → Least Squares projection (cold-start) + PageRank 
blending + +The final score is a weighted combination: + final_score = α * svd_score + β * pagerank_score + γ * projection_score + +Filters are applied post-scoring to remove restaurants that exceed the +user's max_price or max_distance constraints. +""" + +import numpy as np +import pandas as pd +import os, sys + +# Allow running this file directly from the src/ directory +_src = os.path.dirname(os.path.abspath(__file__)) +if _src not in sys.path: + sys.path.insert(0, _src) + +from svd_recommender import predict_ratings, top_n_for_user, mean_center, decompose, recommended_k +from eigen_decomp import covariance_matrix, top_k_eigenvectors, project_onto_top_k +from pagerank_ranker import pagerank_scores, build_similarity_graph, normalize_graph + + +# ── Weights for score blending ─────────────────────────────────────────────── +ALPHA = 0.60 # SVD collaborative filtering weight +BETA = 0.25 # PageRank global importance weight +GAMMA = 0.15 # Cold-start projection weight (only for new users) + + +# ── Feature matrix builder (inline; avoids circular imports) ───────────────── + +def _build_feature_matrix(restaurants: pd.DataFrame) -> np.ndarray: + cols = ["price", "spice", "distance_km", "veg_friendly"] + return restaurants[cols].to_numpy(dtype=float) + + +# ── Constraint filtering ───────────────────────────────────────────────────── + +def apply_constraints(restaurants: pd.DataFrame, scores: np.ndarray, + max_price: int = None, + max_distance: float = None) -> np.ndarray: + """ + Set scores to -inf for restaurants that violate hard constraints. + This ensures they never appear in the top-N list. 
+ """ + filtered = scores.copy() + if max_price is not None: + mask = restaurants["price"].to_numpy() > max_price + filtered[mask] = -np.inf + if max_distance is not None: + mask = restaurants["distance_km"].to_numpy() > max_distance + filtered[mask] = -np.inf + return filtered + + +# ── Existing-user recommendation ───────────────────────────────────────────── + +def recommend_for_existing_user( + user_idx: int, + R: np.ndarray, + restaurants: pd.DataFrame, + n: int = 5, + k: int = None, + max_price: int = None, + max_distance: float = None, +) -> list: + """ + Recommendation pipeline for a user with rating history. + + Parameters + ---------- + user_idx : row index in R + R : (n_users, n_restaurants) raw rating matrix + restaurants : DataFrame with restaurant metadata + n : number of results to return + k : SVD latent dimensions (auto-selected if None) + max_price : hard constraint on price (1-5) + max_distance : hard constraint on distance (km) + + Returns + ------- + List of dicts, each with restaurant metadata + scores + """ + F = _build_feature_matrix(restaurants) + + # ── SVD score ────────────────────────────────────────────────────────── + R_centered, user_means = mean_center(R) + _, sigma, _ = decompose(R_centered) + if k is None: + k = recommended_k(sigma) + R_pred = predict_ratings(R, k=k) + svd_scores = R_pred[user_idx] + + # Normalize SVD scores to [0, 1] + svd_min, svd_max = svd_scores.min(), svd_scores.max() + if svd_max > svd_min: + svd_norm = (svd_scores - svd_min) / (svd_max - svd_min) + else: + svd_norm = np.zeros_like(svd_scores) + + # ── PageRank score ───────────────────────────────────────────────────── + pr_scores = pagerank_scores(F) # already normalized to sum=1 + + # ── Blend ────────────────────────────────────────────────────────────── + final = ALPHA * svd_norm + BETA * pr_scores + + # ── Apply constraints ────────────────────────────────────────────────── + final = apply_constraints(restaurants, final, max_price, max_distance) + + # 
── Exclude already-rated restaurants ────────────────────────────────── + final[R[user_idx] != 0] = -np.inf + + # ── Collect top-N ────────────────────────────────────────────────────── + top_idx = np.argsort(final)[::-1][:n] + return _format_results(top_idx, final, svd_norm, pr_scores, restaurants) + + +# ── New-user recommendation (cold-start) ───────────────────────────────────── + +def recommend_for_new_user( + preferences: np.ndarray, + R: np.ndarray, + restaurants: pd.DataFrame, + n: int = 5, + max_price: int = None, + max_distance: float = None, +) -> list: + """ + Cold-start recommendation: user has no rating history. + + Strategy + -------- + 1. Project the user preference vector onto the top-k eigenvectors of + the restaurant feature covariance matrix (Gram-Schmidt / Least Squares). + 2. Compute cosine similarity between the projected vector and each restaurant. + 3. Blend with PageRank global importance score. + + Parameters + ---------- + preferences : (n_features,) user's stated preferences + [price, spice, distance_km, veg_friendly] + """ + F = _build_feature_matrix(restaurants) + n_rest = F.shape[0] + + # ── Eigendecomposition of feature covariance ─────────────────────────── + C = covariance_matrix(F) + k = min(3, C.shape[0]) + E = top_k_eigenvectors(C, k=k) + + # ── Project user preference vector ───────────────────────────────────── + v_proj = project_onto_top_k(preferences, E) + + # ── Cosine similarity of projected vector vs each restaurant ─────────── + proj_scores = np.array([ + _cosine(v_proj, F[j]) for j in range(n_rest) + ]) + + # ── PageRank score ───────────────────────────────────────────────────── + pr_scores = pagerank_scores(F) + + # ── Blend ────────────────────────────────────────────────────────────── + # For a new user there are no SVD scores, so GAMMA replaces ALPHA + final = (ALPHA + GAMMA) * proj_scores + BETA * pr_scores + + # ── Apply constraints ────────────────────────────────────────────────── + final = 
apply_constraints(restaurants, final, max_price, max_distance) + + top_idx = np.argsort(final)[::-1][:n] + return _format_results(top_idx, final, proj_scores, pr_scores, restaurants) + + +def _cosine(u: np.ndarray, v: np.ndarray) -> float: + nu, nv = np.linalg.norm(u), np.linalg.norm(v) + if nu == 0 or nv == 0: + return 0.0 + return float(np.dot(u, v) / (nu * nv)) + + +# ── Result formatter ───────────────────────────────────────────────────────── + +def _format_results(top_idx, final_scores, content_scores, + pr_scores, restaurants) -> list: + results = [] + for rank, idx in enumerate(top_idx, 1): + if final_scores[idx] == -np.inf: + continue + row = restaurants.iloc[idx] + results.append({ + "rank": rank, + "restaurant_id": int(row["restaurant_id"]), + "name": row["name"], + "cuisine": row["cuisine"], + "price": int(row["price"]), + "spice": int(row["spice"]), + "distance_km": float(row["distance_km"]), + "veg_friendly": bool(row["veg_friendly"]), + "final_score": round(float(final_scores[idx]), 4), + "content_score": round(float(content_scores[idx]), 4), + "pagerank_score": round(float(pr_scores[idx]), 4), + }) + return results + + +# ── Public entry point (called by api.py) ──────────────────────────────────── + +def get_recommendations( + user_id: int | None, + preferences: dict, + R: np.ndarray, + users: pd.DataFrame, + restaurants: pd.DataFrame, + top_n: int = 5, +) -> list: + """ + Main entry point. Called by the Flask API. 
+ + Parameters + ---------- + user_id : existing user ID, or None for a new/anonymous user + preferences : dict with keys: max_price, spice_tolerance, max_distance + (cuisine is used for filtering in the API layer, not here) + R : rating matrix built by matrix_builder.py + users : users DataFrame + restaurants : restaurants DataFrame + top_n : number of recommendations + + Returns + ------- + List of recommendation dicts (see _format_results) + """ + max_price = preferences.get("max_price") + max_distance = preferences.get("max_distance") + spice = preferences.get("spice_tolerance", 3) + + if user_id is not None: + # Map user_id → row index + uid_list = users["user_id"].tolist() + if user_id in uid_list: + user_idx = uid_list.index(user_id) + return recommend_for_existing_user( + user_idx, R, restaurants, n=top_n, + max_price=max_price, max_distance=max_distance, + ) + + # New / anonymous user — use preference vector + price_pref = max_price if max_price else 3 + dist_pref = max_distance if max_distance else 3.0 + pref_vec = np.array([price_pref, spice, dist_pref, 1.0], dtype=float) + + return recommend_for_new_user( + pref_vec, R, restaurants, n=top_n, + max_price=max_price, max_distance=max_distance, + ) + + +# ── Smoke test ─────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + base = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.join(base, "src")) + from data_loader import load_all + from matrix_builder import build_rating_matrix + + restaurants, users, ratings = load_all(os.path.join(base, "data")) + R = build_rating_matrix(users, restaurants, ratings) + + print("=" * 55) + print("EXISTING USER — Alice (user_id=1)") + print("=" * 55) + recs = get_recommendations( + user_id=1, + preferences={"max_price": 4, "max_distance": 4.0, "spice_tolerance": 5}, + R=R, users=users, restaurants=restaurants, top_n=5, + ) + for r in recs: + print(f" #{r['rank']} {r['name']:<25} " + 
f"score={r['final_score']:.4f} " + f"(svd={r['content_score']:.4f}, pr={r['pagerank_score']:.4f})") + + print("\n" + "=" * 55) + print("NEW USER — no history, prefers cheap + spicy + nearby") + print("=" * 55) + recs_new = get_recommendations( + user_id=None, + preferences={"max_price": 2, "max_distance": 2.0, "spice_tolerance": 5}, + R=R, users=users, restaurants=restaurants, top_n=5, + ) + for r in recs_new: + print(f" #{r['rank']} {r['name']:<25} " + f"score={r['final_score']:.4f} " + f"(proj={r['content_score']:.4f}, pr={r['pagerank_score']:.4f})") From 09f149a5ac4712388b72a9acd902db803bbb6907 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 22:27:31 +0530 Subject: [PATCH 08/25] Add files via upload From b44d65653c9ef091569c579106e566215ba2bfb3 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 22:28:11 +0530 Subject: [PATCH 09/25] Add files via upload From 63fc6b4aa07deb495abcca8dc2ee11ffedc5e048 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 22:34:54 +0530 Subject: [PATCH 10/25] Add files via upload From 19d9a77ea340828878f6d589872c33ed160a535f Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 22:46:58 +0530 Subject: [PATCH 11/25] csv files addded From d39b336630268ea7030e8d5790582165046fac9b Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 22:47:49 +0530 Subject: [PATCH 12/25] Delete data directory --- data/ratings.csv | 1 - data/restaurants.csv | 1 - data/users.csv | 1 - 3 files changed, 3 deletions(-) delete mode 100644 data/ratings.csv delete mode 100644 data/restaurants.csv delete mode 100644 data/users.csv diff --git a/data/ratings.csv b/data/ratings.csv deleted file mode 100644 index ef17bca..0000000 --- a/data/ratings.csv +++ /dev/null @@ -1 +0,0 @@ -user_id,restaurant_id,rating diff --git a/data/restaurants.csv b/data/restaurants.csv deleted file mode 100644 index b230675..0000000 --- a/data/restaurants.csv +++ /dev/null @@ -1 +0,0 @@ 
-restaurant_id,name,cuisine,price,spice,distance_km,veg_friendly diff --git a/data/users.csv b/data/users.csv deleted file mode 100644 index f44dd20..0000000 --- a/data/users.csv +++ /dev/null @@ -1 +0,0 @@ -user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance From 400e2cc18a5dd3a07fd61abf6fd2b579ecc64600 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 22:48:07 +0530 Subject: [PATCH 13/25] Create data --- data | 1 + 1 file changed, 1 insertion(+) create mode 100644 data diff --git a/data b/data new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/data @@ -0,0 +1 @@ + From ce53e9b720929f07a1a909e343f34f20dc154de2 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 22:51:48 +0530 Subject: [PATCH 14/25] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 04d869a..3bf3d69 100644 --- a/README.md +++ b/README.md @@ -28,3 +28,4 @@ Run tests: ```bash pytest tests/ -v ``` +test123 From fa198f02fb0cce9e567d1ed6c7cc98836c96e017 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 23:18:08 +0530 Subject: [PATCH 15/25] Add files via upload --- ratings.csv | 1 + restaurants.csv | 1 + users.csv | 1 + 3 files changed, 3 insertions(+) create mode 100644 ratings.csv create mode 100644 restaurants.csv create mode 100644 users.csv diff --git a/ratings.csv b/ratings.csv new file mode 100644 index 0000000..ef17bca --- /dev/null +++ b/ratings.csv @@ -0,0 +1 @@ +user_id,restaurant_id,rating diff --git a/restaurants.csv b/restaurants.csv new file mode 100644 index 0000000..b230675 --- /dev/null +++ b/restaurants.csv @@ -0,0 +1 @@ +restaurant_id,name,cuisine,price,spice,distance_km,veg_friendly diff --git a/users.csv b/users.csv new file mode 100644 index 0000000..f44dd20 --- /dev/null +++ b/users.csv @@ -0,0 +1 @@ +user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance From 5710d71d8233d9fd75ddabd1d5b42884323a492f Mon Sep 17 00:00:00 2001 
From: meghanagrandhi Date: Mon, 13 Apr 2026 23:20:11 +0530 Subject: [PATCH 16/25] Add files via upload --- ratings.csv | 51 +++++++++++++++++++++++++++++++++++++++++++++++++ restaurants.csv | 20 +++++++++++++++++++ users (1).csv | 11 +++++++++++ 3 files changed, 82 insertions(+) create mode 100644 users (1).csv diff --git a/ratings.csv b/ratings.csv index ef17bca..c6b75c5 100644 --- a/ratings.csv +++ b/ratings.csv @@ -1 +1,52 @@ user_id,restaurant_id,rating +1,1,5.0 +1,7,4.5 +1,15,4.8 +1,4,3.5 +2,2,4.5 +2,14,4.0 +2,8,3.5 +2,3,3.0 +3,3,5.0 +3,16,4.5 +3,18,3.5 +4,4,5.0 +4,12,4.0 +4,19,3.5 +4,1,3.0 +5,9,5.0 +5,17,4.5 +5,11,4.0 +5,20,3.0 +6,6,4.5 +6,13,4.0 +6,20,4.5 +6,10,3.5 +7,8,5.0 +7,18,4.5 +7,3,4.0 +7,11,3.5 +8,10,5.0 +8,1,4.0 +8,19,4.5 +8,7,3.5 +9,11,5.0 +9,17,4.5 +9,9,4.0 +9,6,3.0 +10,19,5.0 +10,10,4.0 +10,1,3.5 +10,16,4.0 +1,9,4.0 +2,6,3.0 +3,11,4.0 +4,6,3.5 +5,12,4.0 +6,19,4.0 +7,9,4.5 +8,6,4.0 +9,13,3.5 +10,4,3.0 +1,16,3.0 +2,20,2.5 diff --git a/restaurants.csv b/restaurants.csv index b230675..b94e665 100644 --- a/restaurants.csv +++ b/restaurants.csv @@ -1 +1,21 @@ restaurant_id,name,cuisine,price,spice,distance_km,veg_friendly +1,Spice Garden,Indian,2,5,1.2,1 +2,The Burger Joint,American,2,2,0.8,0 +3,Sakura Sushi,Japanese,4,1,3.5,0 +4,Casa Mexico,Mexican,2,4,2.1,1 +5,Le Petit Bistro,French,5,1,5.0,0 +6,Noodle House,Chinese,2,3,1.5,1 +7,Tandoor Palace,Indian,3,5,2.8,1 +8,Pizza Roma,Italian,3,2,1.0,0 +9,Green Bowl,Vegan,3,2,0.5,1 +10,Seoul BBQ,Korean,3,4,3.2,0 +11,Mezze Corner,Mediterranean,3,2,2.5,1 +12,Taco Town,Mexican,1,3,0.7,1 +13,Dragon Wok,Chinese,2,4,1.8,1 +14,The Steakhouse,American,5,1,4.5,0 +15,Curry Leaf,Indian,2,4,1.1,1 +16,Ramen Republic,Japanese,3,3,2.3,0 +17,Falafel King,Mediterranean,1,2,0.9,1 +18,Pasta Palace,Italian,3,1,2.0,0 +19,Thai Orchid,Thai,3,4,3.8,1 +20,Dim Sum Den,Chinese,2,2,1.4,1 diff --git a/users (1).csv b/users (1).csv new file mode 100644 index 0000000..4d57058 --- /dev/null +++ b/users (1).csv @@ -0,0 +1,11 @@ 
+user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance +1,Alice,Indian,3,5,3.0 +2,Bob,American,4,2,5.0 +3,Carol,Japanese,5,2,4.0 +4,Dave,Mexican,3,4,3.0 +5,Eve,Vegan,4,3,2.0 +6,Frank,Chinese,3,4,4.0 +7,Grace,Italian,4,2,3.0 +8,Heidi,Korean,4,5,5.0 +9,Ivan,Mediterranean,3,3,3.0 +10,Judy,Thai,3,4,4.0 From cc4996554e823edc0557f591a8250db644ed86d8 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 23:20:38 +0530 Subject: [PATCH 17/25] Delete users.csv --- users.csv | 1 - 1 file changed, 1 deletion(-) delete mode 100644 users.csv diff --git a/users.csv b/users.csv deleted file mode 100644 index f44dd20..0000000 --- a/users.csv +++ /dev/null @@ -1 +0,0 @@ -user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance From 4843aad5a702e846ac0151e3fb53ba2455d360ae Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 23:20:56 +0530 Subject: [PATCH 18/25] Rename users (1).csv to users .csv --- users (1).csv => users .csv | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename users (1).csv => users .csv (100%) diff --git a/users (1).csv b/users .csv similarity index 100% rename from users (1).csv rename to users .csv From 6dc6f166715a82c5765aa8e6efda5afa549876d4 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 23:22:09 +0530 Subject: [PATCH 19/25] Delete data --- data | 1 - 1 file changed, 1 deletion(-) delete mode 100644 data diff --git a/data b/data deleted file mode 100644 index 8b13789..0000000 --- a/data +++ /dev/null @@ -1 +0,0 @@ - From b2d9f7f677be8967248c5e32e4afba1df4c7f2aa Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 23:22:23 +0530 Subject: [PATCH 20/25] Rename users .csv to data/users .csv --- users .csv => data/users .csv | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename users .csv => data/users .csv (100%) diff --git a/users .csv b/data/users .csv similarity index 100% rename from users .csv rename to data/users .csv From 
67fd2396a83dab588d59bb082c0e42bf6a0d8933 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 23:23:14 +0530 Subject: [PATCH 21/25] Delete data directory --- data/users .csv | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 data/users .csv diff --git a/data/users .csv b/data/users .csv deleted file mode 100644 index 4d57058..0000000 --- a/data/users .csv +++ /dev/null @@ -1,11 +0,0 @@ -user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance -1,Alice,Indian,3,5,3.0 -2,Bob,American,4,2,5.0 -3,Carol,Japanese,5,2,4.0 -4,Dave,Mexican,3,4,3.0 -5,Eve,Vegan,4,3,2.0 -6,Frank,Chinese,3,4,4.0 -7,Grace,Italian,4,2,3.0 -8,Heidi,Korean,4,5,5.0 -9,Ivan,Mediterranean,3,3,3.0 -10,Judy,Thai,3,4,4.0 From 505a653672cccdfab49e17afaac1e5bf77ddcd14 Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Mon, 13 Apr 2026 23:23:47 +0530 Subject: [PATCH 22/25] Rename ratings.csv to data/ratings.csv --- ratings.csv => data/ratings.csv | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ratings.csv => data/ratings.csv (100%) diff --git a/ratings.csv b/data/ratings.csv similarity index 100% rename from ratings.csv rename to data/ratings.csv From 299843f6d6085e7cf51d1e5118e8f6c032f487cc Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Tue, 14 Apr 2026 16:33:01 +0530 Subject: [PATCH 23/25] Rename restaurants.csv to data/restaurants.csv --- restaurants.csv => data/restaurants.csv | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename restaurants.csv => data/restaurants.csv (100%) diff --git a/restaurants.csv b/data/restaurants.csv similarity index 100% rename from restaurants.csv rename to data/restaurants.csv From b9f6f42b350c4de8828ba8ebec58099f300be74b Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Tue, 14 Apr 2026 16:34:08 +0530 Subject: [PATCH 24/25] Add files via upload --- data/users.csv | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 data/users.csv diff --git a/data/users.csv b/data/users.csv new file mode 
100644 index 0000000..4d57058 --- /dev/null +++ b/data/users.csv @@ -0,0 +1,11 @@ +user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance +1,Alice,Indian,3,5,3.0 +2,Bob,American,4,2,5.0 +3,Carol,Japanese,5,2,4.0 +4,Dave,Mexican,3,4,3.0 +5,Eve,Vegan,4,3,2.0 +6,Frank,Chinese,3,4,4.0 +7,Grace,Italian,4,2,3.0 +8,Heidi,Korean,4,5,5.0 +9,Ivan,Mediterranean,3,3,3.0 +10,Judy,Thai,3,4,4.0 From 6862eeaf4cd5d635a5fa12dda843df3afd7eb78c Mon Sep 17 00:00:00 2001 From: meghanagrandhi Date: Thu, 16 Apr 2026 11:18:15 +0530 Subject: [PATCH 25/25] Delete data directory --- data/ratings.csv | 52 -------------------------------------------- data/restaurants.csv | 21 ------------------ data/users.csv | 11 ---------- 3 files changed, 84 deletions(-) delete mode 100644 data/ratings.csv delete mode 100644 data/restaurants.csv delete mode 100644 data/users.csv diff --git a/data/ratings.csv b/data/ratings.csv deleted file mode 100644 index c6b75c5..0000000 --- a/data/ratings.csv +++ /dev/null @@ -1,52 +0,0 @@ -user_id,restaurant_id,rating -1,1,5.0 -1,7,4.5 -1,15,4.8 -1,4,3.5 -2,2,4.5 -2,14,4.0 -2,8,3.5 -2,3,3.0 -3,3,5.0 -3,16,4.5 -3,18,3.5 -4,4,5.0 -4,12,4.0 -4,19,3.5 -4,1,3.0 -5,9,5.0 -5,17,4.5 -5,11,4.0 -5,20,3.0 -6,6,4.5 -6,13,4.0 -6,20,4.5 -6,10,3.5 -7,8,5.0 -7,18,4.5 -7,3,4.0 -7,11,3.5 -8,10,5.0 -8,1,4.0 -8,19,4.5 -8,7,3.5 -9,11,5.0 -9,17,4.5 -9,9,4.0 -9,6,3.0 -10,19,5.0 -10,10,4.0 -10,1,3.5 -10,16,4.0 -1,9,4.0 -2,6,3.0 -3,11,4.0 -4,6,3.5 -5,12,4.0 -6,19,4.0 -7,9,4.5 -8,6,4.0 -9,13,3.5 -10,4,3.0 -1,16,3.0 -2,20,2.5 diff --git a/data/restaurants.csv b/data/restaurants.csv deleted file mode 100644 index b94e665..0000000 --- a/data/restaurants.csv +++ /dev/null @@ -1,21 +0,0 @@ -restaurant_id,name,cuisine,price,spice,distance_km,veg_friendly -1,Spice Garden,Indian,2,5,1.2,1 -2,The Burger Joint,American,2,2,0.8,0 -3,Sakura Sushi,Japanese,4,1,3.5,0 -4,Casa Mexico,Mexican,2,4,2.1,1 -5,Le Petit Bistro,French,5,1,5.0,0 -6,Noodle House,Chinese,2,3,1.5,1 -7,Tandoor 
Palace,Indian,3,5,2.8,1 -8,Pizza Roma,Italian,3,2,1.0,0 -9,Green Bowl,Vegan,3,2,0.5,1 -10,Seoul BBQ,Korean,3,4,3.2,0 -11,Mezze Corner,Mediterranean,3,2,2.5,1 -12,Taco Town,Mexican,1,3,0.7,1 -13,Dragon Wok,Chinese,2,4,1.8,1 -14,The Steakhouse,American,5,1,4.5,0 -15,Curry Leaf,Indian,2,4,1.1,1 -16,Ramen Republic,Japanese,3,3,2.3,0 -17,Falafel King,Mediterranean,1,2,0.9,1 -18,Pasta Palace,Italian,3,1,2.0,0 -19,Thai Orchid,Thai,3,4,3.8,1 -20,Dim Sum Den,Chinese,2,2,1.4,1 diff --git a/data/users.csv b/data/users.csv deleted file mode 100644 index 4d57058..0000000 --- a/data/users.csv +++ /dev/null @@ -1,11 +0,0 @@ -user_id,name,preferred_cuisine,max_price,spice_tolerance,max_distance -1,Alice,Indian,3,5,3.0 -2,Bob,American,4,2,5.0 -3,Carol,Japanese,5,2,4.0 -4,Dave,Mexican,3,4,3.0 -5,Eve,Vegan,4,3,2.0 -6,Frank,Chinese,3,4,4.0 -7,Grace,Italian,4,2,3.0 -8,Heidi,Korean,4,5,5.0 -9,Ivan,Mediterranean,3,3,3.0 -10,Judy,Thai,3,4,4.0