Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
92a1e85
Implement data loading and validation for CSV files
meghanagrandhi Apr 12, 2026
3736fac
Add matrix construction utilities for TasteVector
meghanagrandhi Apr 12, 2026
d125010
Update gaussian_elimination.py
meghanagrandhi Apr 12, 2026
d16160b
Update svd_recommender.py
meghanagrandhi Apr 12, 2026
620c683
Add eigendecomposition and diagonalization utilities
meghanagrandhi Apr 12, 2026
ec198ce
Update pagerank_ranker.py
meghanagrandhi Apr 12, 2026
df82dd6
Update recommender.py
meghanagrandhi Apr 12, 2026
09f149a
Add files via upload
meghanagrandhi Apr 13, 2026
b44d656
Add files via upload
meghanagrandhi Apr 13, 2026
63fc6b4
Add files via upload
meghanagrandhi Apr 13, 2026
19d9a77
csv files addded
meghanagrandhi Apr 13, 2026
d39b336
Delete data directory
meghanagrandhi Apr 13, 2026
400e2cc
Create data
meghanagrandhi Apr 13, 2026
ce53e9b
Update README.md
meghanagrandhi Apr 13, 2026
af20e4a
Merge pull request #1 from meghanagrandhi/patch-8
meghanagrandhi Apr 13, 2026
6a459b4
Merge pull request #2 from meghanagrandhi/patch-7
meghanagrandhi Apr 13, 2026
0b17dbf
Merge pull request #3 from meghanagrandhi/patch-6
meghanagrandhi Apr 13, 2026
f941b0b
Merge pull request #4 from meghanagrandhi/patch-5
meghanagrandhi Apr 13, 2026
585a8e2
Merge pull request #5 from meghanagrandhi/patch-4
meghanagrandhi Apr 13, 2026
4e1bc25
Merge pull request #6 from meghanagrandhi/patch-3
meghanagrandhi Apr 13, 2026
f2ef7d6
Merge pull request #7 from meghanagrandhi/patch-2
meghanagrandhi Apr 13, 2026
f492144
Merge pull request #8 from meghanagrandhi/patch-1
meghanagrandhi Apr 13, 2026
fa198f0
Add files via upload
meghanagrandhi Apr 13, 2026
5710d71
Add files via upload
meghanagrandhi Apr 13, 2026
cc49965
Delete users.csv
meghanagrandhi Apr 13, 2026
4843aad
Rename users (1).csv to users .csv
meghanagrandhi Apr 13, 2026
6dc6f16
Delete data
meghanagrandhi Apr 13, 2026
b2d9f7f
Rename users .csv to data/users .csv
meghanagrandhi Apr 13, 2026
67fd239
Delete data directory
meghanagrandhi Apr 13, 2026
505a653
Rename ratings.csv to data/ratings.csv
meghanagrandhi Apr 13, 2026
299843f
Rename restaurants.csv to data/restaurants.csv
meghanagrandhi Apr 14, 2026
b9f6f42
Add files via upload
meghanagrandhi Apr 14, 2026
6862eea
Delete data directory
meghanagrandhi Apr 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ Run tests:
```bash
pytest tests/ -v
```
test123
1 change: 0 additions & 1 deletion data/ratings.csv

This file was deleted.

1 change: 0 additions & 1 deletion data/restaurants.csv

This file was deleted.

1 change: 0 additions & 1 deletion data/users.csv

This file was deleted.

146 changes: 146 additions & 0 deletions src/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,148 @@
"""Data loading utilities for TasteVector."""
"""
data_loader.py — Data Ingestion & Validation
============================================
Person 1 | TasteVector Project

Single point of contact with raw CSV files.
Reads restaurants.csv, users.csv, ratings.csv into clean Pandas DataFrames.
All other modules receive DataFrames or NumPy arrays — never raw CSV paths.
"""

import os
import pandas as pd


# ── Expected schema ──────────────────────────────────────────────────────────

# Columns that restaurants.csv must contain (checked by load_restaurants).
RESTAURANT_COLS = {
    "restaurant_id",
    "name",
    "cuisine",
    "price",
    "spice",
    "distance_km",
    "veg_friendly",
}

# Columns that users.csv must contain (checked by load_users).
USER_COLS = {
    "user_id",
    "name",
    "preferred_cuisine",
    "max_price",
    "spice_tolerance",
    "max_distance",
}

# Columns that ratings.csv must contain (checked by load_ratings).
RATING_COLS = {
    "user_id",
    "restaurant_id",
    "rating",
}


# ── Loaders ──────────────────────────────────────────────────────────────────

def load_restaurants(data_dir: str) -> pd.DataFrame:
    """
    Load and validate restaurants.csv.

    Columns: restaurant_id, name, cuisine, price (1-5), spice (1-5),
             distance_km (float), veg_friendly (0 or 1)

    The five numeric columns are parsed with ``pd.to_numeric``; a row is
    dropped (and counted in the warning emitted by ``_warn_dropped``) when
    any required column is empty or a numeric field cannot be parsed.
    The value ranges listed above are descriptive only -- this loader does
    not enforce them.

    Parameters
    ----------
    data_dir : directory that contains restaurants.csv

    Returns
    -------
    DataFrame with a fresh 0..n-1 index. ``restaurant_id`` is int64
    whenever every surviving id is a whole number.

    Raises
    ------
    ValueError : a required column is absent (raised by _check_columns).
    Errors from ``pd.read_csv`` (e.g. a missing file) propagate unchanged.
    """
    path = os.path.join(data_dir, "restaurants.csv")
    df = pd.read_csv(path)

    _check_columns(df, RESTAURANT_COLS, "restaurants.csv")

    # Unparseable values become NaN so the dropna below removes the row.
    for col in ("restaurant_id", "price", "spice",
                "distance_km", "veg_friendly"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    before = len(df)
    df = df.dropna(subset=list(RESTAURANT_COLS))
    _warn_dropped(before, len(df), "restaurants.csv")

    df = df.reset_index(drop=True)

    # A single malformed row makes to_numeric return float64 (NaN needs a
    # float column), and the column stays float after that row is dropped.
    # Restore integer ids so the key dtype does not depend on how clean the
    # file happened to be.
    ids = df["restaurant_id"]
    if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
        df["restaurant_id"] = ids.astype("int64")
    return df


def load_users(data_dir: str) -> pd.DataFrame:
    """
    Load and validate users.csv.

    Columns: user_id, name, preferred_cuisine, max_price (1-5),
             spice_tolerance (1-5), max_distance (float)

    The four numeric columns are parsed with ``pd.to_numeric``; a row is
    dropped (and counted in the warning emitted by ``_warn_dropped``) when
    any required column is empty or a numeric field cannot be parsed.
    The value ranges listed above are descriptive only -- this loader does
    not enforce them.

    Parameters
    ----------
    data_dir : directory that contains users.csv

    Returns
    -------
    DataFrame with a fresh 0..n-1 index. ``user_id`` is int64 whenever
    every surviving id is a whole number.

    Raises
    ------
    ValueError : a required column is absent (raised by _check_columns).
    Errors from ``pd.read_csv`` (e.g. a missing file) propagate unchanged.
    """
    path = os.path.join(data_dir, "users.csv")
    df = pd.read_csv(path)

    _check_columns(df, USER_COLS, "users.csv")

    # Unparseable values become NaN so the dropna below removes the row.
    for col in ("user_id", "max_price", "spice_tolerance", "max_distance"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    before = len(df)
    df = df.dropna(subset=list(USER_COLS))
    _warn_dropped(before, len(df), "users.csv")

    df = df.reset_index(drop=True)

    # Coercion turns the id column into float64 as soon as one row is
    # malformed; cast back so ids keep an integer dtype after cleaning.
    ids = df["user_id"]
    if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
        df["user_id"] = ids.astype("int64")
    return df


def load_ratings(data_dir: str) -> pd.DataFrame:
    """
    Load and validate ratings.csv.

    Columns: user_id, restaurant_id, rating (1.0 - 5.0)

    All three columns are parsed with ``pd.to_numeric``. A row is dropped
    (and counted in the warning emitted by ``_warn_dropped``) when a value
    is missing or unparseable, or when the rating lies outside the closed
    interval [1.0, 5.0]. Out-of-range ratings are removed, not clamped.

    Parameters
    ----------
    data_dir : directory that contains ratings.csv

    Returns
    -------
    DataFrame with a fresh 0..n-1 index. ``user_id`` and ``restaurant_id``
    are int64 whenever every surviving id is a whole number.

    Raises
    ------
    ValueError : a required column is absent (raised by _check_columns).
    Errors from ``pd.read_csv`` (e.g. a missing file) propagate unchanged.
    """
    path = os.path.join(data_dir, "ratings.csv")
    df = pd.read_csv(path)

    _check_columns(df, RATING_COLS, "ratings.csv")

    # Unparseable values become NaN so the dropna below removes the row.
    for col in ("user_id", "restaurant_id", "rating"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    before = len(df)
    df = df.dropna(subset=list(RATING_COLS))
    # Discard (rather than clamp) ratings outside the valid 1.0-5.0 range.
    df = df[(df["rating"] >= 1.0) & (df["rating"] <= 5.0)]
    _warn_dropped(before, len(df), "ratings.csv")

    df = df.reset_index(drop=True)

    # Coercion turns the id columns into float64 as soon as one row is
    # malformed; cast back so ids keep an integer dtype after cleaning.
    for col in ("user_id", "restaurant_id"):
        ids = df[col]
        if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
            df[col] = ids.astype("int64")
    return df


def load_all(data_dir: str) -> tuple:
    """
    Convenience wrapper that loads every CSV from one directory.

    The three loaders run in the order restaurants, users, ratings, each
    reading its own file from ``data_dir``.

    Returns
    -------
    (restaurants, users, ratings) as clean DataFrames
    """
    return (
        load_restaurants(data_dir),
        load_users(data_dir),
        load_ratings(data_dir),
    )


# ── Helpers ──────────────────────────────────────────────────────────────────

def _check_columns(df: pd.DataFrame, required: set, filename: str) -> None:
missing = required - set(df.columns)
if missing:
raise ValueError(f"{filename} is missing required columns: {missing}")


def _warn_dropped(before: int, after: int, filename: str) -> None:
dropped = before - after
if dropped > 0:
print(f"[data_loader] WARNING: dropped {dropped} malformed "
f"row(s) from {filename}")


# ── Smoke test ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Manual smoke test: load the CSVs from <project root>/data and print a
    # preview of each table. ``os`` is already imported at module level, and
    # the previous in-block ``import os, sys`` never used ``sys``.
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    restaurants, users, ratings = load_all(os.path.join(base, "data"))

    print(f"Restaurants : {len(restaurants)} rows")
    print(restaurants.head(3).to_string(index=False))
    print(f"\nUsers : {len(users)} rows")
    print(users.head(3).to_string(index=False))
    print(f"\nRatings : {len(ratings)} rows")
    print(ratings.head(5).to_string(index=False))

215 changes: 215 additions & 0 deletions src/eigen_decomp.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,217 @@
"""Eigendecomposition and diagonalization utilities."""

"""
eigen_decomp.py — Eigendecomposition & Diagonalization
=======================================================
Person 3 | TasteVector Project

Computes the (uncentered) covariance matrix C = F^T F of the restaurant
feature matrix, then finds its eigenvalues and eigenvectors:

C = P D P^{-1} (diagonalization)

where D = diag(eigenvalues) and P = matrix of eigenvectors (columns).

The top eigenvectors capture the feature directions along which restaurants
vary most (analogous to PCA principal components). These are used to
project user preference vectors into the most informative subspace.

Also validates the Cayley-Hamilton theorem as a correctness check:
substituting C into its own characteristic polynomial gives p(C) = 0, and
each eigenvalue λ satisfies p(λ) = 0 (both up to floating-point error).
"""

import numpy as np


# ── Covariance matrix ────────────────────────────────────────────────────────

def covariance_matrix(F: np.ndarray) -> np.ndarray:
    """
    Compute the feature covariance matrix C = F^T @ F.

    No mean is subtracted, so this is the uncentered second-moment (Gram)
    matrix of the feature columns.

    Parameters
    ----------
    F : (n_restaurants, n_features) array or array-like

    Returns
    -------
    C : (n_features, n_features) -- symmetric, positive semi-definite

    Raises
    ------
    ValueError : F is not 2-D. A 1-D input would otherwise make
                 ``F.T @ F`` collapse to a scalar dot product instead of
                 a matrix.
    """
    F = np.asarray(F)
    if F.ndim != 2:
        raise ValueError(
            "F must be a 2-D (n_restaurants, n_features) array, "
            f"got ndim={F.ndim}"
        )
    return F.T @ F


# ── Eigendecomposition ───────────────────────────────────────────────────────

def eigen_decompose(C: np.ndarray) -> tuple:
    """
    Compute eigenvalues and eigenvectors of a square matrix C.

        C v = λ v

    Symmetric (Hermitian) input -- which includes every C = F^T F -- is
    handled by ``np.linalg.eigh``, which returns real eigenvalues and
    orthonormal eigenvectors even when eigenvalues repeat. The general
    solver ``np.linalg.eig`` promises neither, and the projection
    ``E @ E.T @ v`` is only valid for orthonormal columns. Any other
    input falls back to ``np.linalg.eig``.

    Returns
    -------
    eigenvalues  : (n,)   -- sorted by descending magnitude; real for
                            symmetric input, may be complex otherwise
    eigenvectors : (n, n) -- columns are eigenvectors, in the same order

    Raises
    ------
    numpy.linalg.LinAlgError : raised by NumPy for non-square input or a
                               failed decomposition.
    """
    C = np.asarray(C)

    # Tight tolerances: only round-off-level asymmetry counts as symmetric,
    # because eigh reads just one triangle of the matrix.
    is_square = C.ndim == 2 and C.shape[0] == C.shape[1]
    if is_square and np.allclose(C, C.conj().T, rtol=1e-10, atol=1e-12):
        eigenvalues, eigenvectors = np.linalg.eigh(C)
    else:
        eigenvalues, eigenvectors = np.linalg.eig(C)

    # sort descending by magnitude so the most important directions come first
    order = np.argsort(np.abs(eigenvalues))[::-1]
    return eigenvalues[order], eigenvectors[:, order]


# ── Diagonalization C = P D P^{-1} ─────────────────────────────────────────

def diagonalize(C: np.ndarray) -> tuple:
    """
    Factor C as P D P^{-1} using its eigendecomposition.

    The eigenpairs come from ``eigen_decompose``; the inverse is computed
    with ``np.linalg.inv``.

    Returns
    -------
    P    : (n, n) matrix whose columns are eigenvectors
    D    : (n, n) diagonal matrix of eigenvalues
    P_inv: (n, n) inverse of P
    """
    spectrum, basis = eigen_decompose(C)
    return basis, np.diag(spectrum), np.linalg.inv(basis)


def verify_diagonalization(C: np.ndarray, P, D, P_inv,
                           tol: float = 1e-8) -> bool:
    """
    Report whether P @ D @ P^{-1} reproduces C.

    The comparison uses ``np.allclose`` with ``atol=tol``; NumPy's default
    relative tolerance is left in place and applies as well.

    Returns True if the reconstruction is accurate, False otherwise.
    """
    rebuilt = (P @ D) @ P_inv
    matches = np.allclose(rebuilt, C, atol=tol)
    return bool(matches)


# ── Cayley-Hamilton theorem ──────────────────────────────────────────────────

def cayley_hamilton_check(C: np.ndarray, tol: float = 1e-6) -> dict:
    """
    Cayley-Hamilton: every matrix satisfies its own characteristic polynomial.

    The characteristic polynomial of C is p(λ) = det(λI - C). Its
    coefficients are obtained with ``np.poly`` and p(C) is evaluated with
    Horner's method; if the theorem holds, the result is the zero matrix
    up to floating-point error. For each eigenvalue λ_i the scalar
    residual |p(λ_i)| is checked as well.

    Round-off in p(C) grows with the size of its individual terms (roughly
    ||C||^n), so a purely absolute threshold rejects any matrix with large
    entries. Both thresholds are therefore multiplied by

        scale = max(1, sum_k |c_k| * ||C||_F^(n-k))

    which keeps the original absolute behaviour for small matrices and is
    never stricter than it.

    Parameters
    ----------
    C   : (n, n) square array
    tol : tolerance relative to ``scale``

    Returns
    -------
    dict with keys:
        'matrix_check'          : bool -- ||p(C)||_F < tol * n^2 * scale
        'matrix_frobenius_norm' : float -- ||p(C)||_F (unscaled)
        'eigenvalue_residuals'  : array of |p(λ_i)| for each eigenvalue
        'all_pass'              : bool -- matrix check passed and every
                                  residual is below tol * scale
    """
    n = C.shape[0]
    coeffs = np.poly(C)          # characteristic polynomial coefficients

    # Evaluate p(C) using Horner's method
    identity = np.eye(n)
    pC = np.zeros_like(C, dtype=complex)
    for c in coeffs:
        pC = pC @ C + c * identity

    # Magnitude of the terms being cancelled: the polynomial with absolute
    # coefficients evaluated at ||C||_F (which bounds every |λ_i|).
    scale = max(1.0, float(np.polyval(np.abs(coeffs), np.linalg.norm(C))))

    matrix_frobenius = np.linalg.norm(pC)
    matrix_ok = bool(matrix_frobenius < tol * n * n * scale)

    # Evaluate p(λ) for each eigenvalue
    eigenvalues, _ = np.linalg.eig(C)
    residuals = np.abs(np.polyval(coeffs, eigenvalues))
    eigenvalue_ok = bool(np.all(residuals < tol * scale))

    return {
        "matrix_check": matrix_ok,
        "matrix_frobenius_norm": float(matrix_frobenius),
        "eigenvalue_residuals": residuals,
        "all_pass": matrix_ok and eigenvalue_ok,
    }


# ── Top eigenvectors (PCA-style projection) ──────────────────────────────────

def top_k_eigenvectors(C: np.ndarray, k: int) -> np.ndarray:
    """
    Return the k leading eigenvectors of C.

    "Leading" follows the ordering of ``eigen_decompose`` (descending
    eigenvalue magnitude), which for a positive semi-definite C = F^T F
    is the same as the k largest eigenvalues.

    Parameters
    ----------
    C : (n_features, n_features) square matrix
    k : number of eigenvectors to keep; values above n_features return
        all eigenvectors

    Returns
    -------
    E : (n_features, k) -- columns are the top-k eigenvectors

    Raises
    ------
    ValueError : k is negative. A negative k would otherwise be read as
                 "all but the last |k| columns" by the slice below.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    _, eigenvectors = eigen_decompose(C)
    return eigenvectors[:, :k]


def project_onto_top_k(v: np.ndarray, E: np.ndarray) -> np.ndarray:
    """
    Map a preference vector v into the column space of E and back.

        v_proj = E @ E^T @ v

    This equals the orthogonal projection onto span(E) when the columns of
    E are orthonormal; the function itself does not check that.

    Parameters
    ----------
    v : (n_features,)    -- user preference vector
    E : (n_features, k)  -- top-k eigenvectors from top_k_eigenvectors()

    Returns
    -------
    v_proj : (n_features,) -- projection of v onto the top-k subspace
    """
    # Coordinates of v along each column of E, then back to feature space.
    coords = E.T @ v
    return E @ coords


# ── Analysis report ──────────────────────────────────────────────────────────

def eigen_report(F: np.ndarray, feature_names: list = None) -> None:
    """
    Print a human-readable eigendecomposition report for the feature matrix F.

    The report lists each eigenvalue of C = F^T F with its share of the
    total eigenvalue magnitude and the feature with the largest absolute
    weight in the matching eigenvector, then the results of the
    diagonalization and Cayley-Hamilton checks.

    Parameters
    ----------
    F             : (n_restaurants, n_features) feature matrix
    feature_names : one label per feature column; defaults to
                    feature_0, feature_1, ...
    """
    C = covariance_matrix(F)
    # diagonalize() already performs the eigendecomposition; reuse its
    # output instead of decomposing C a second time.
    P, D, P_inv = diagonalize(C)
    eigenvalues, eigenvectors = np.diag(D), P

    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(F.shape[1])]

    print("=" * 55)
    print("EIGENDECOMPOSITION REPORT")
    print("=" * 55)
    print(f"\nCovariance matrix C = F^T F shape: {C.shape}")

    total = np.abs(eigenvalues).sum()
    print("\nEigenvalues (sorted descending):")
    for i, (lam, vec) in enumerate(zip(eigenvalues, eigenvectors.T)):
        # An all-zero F gives total == 0; report 0% rather than dividing
        # by zero and printing nan.
        pct = 100 * abs(lam) / total if total > 0 else 0.0
        top_feat = feature_names[np.argmax(np.abs(vec))]
        print(f" λ_{i+1} = {lam.real:8.3f} ({pct:5.1f}%) "
              f"dominant feature: {top_feat}")

    diag_ok = verify_diagonalization(C, P, D, P_inv)
    print(f"\nDiagonalization C = P D P^{{-1}} verified: {diag_ok}")

    ch = cayley_hamilton_check(C)
    print(f"Cayley-Hamilton check passed: {ch['all_pass']}")
    print(f" p(C) Frobenius norm: {ch['matrix_frobenius_norm']:.2e}")


# ── Smoke test ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Manual smoke test: run the report on the project's restaurant data,
    # then project one example preference vector onto the top-2 subspace.
    import os
    import sys

    # Put <project root>/src on the path so data_loader can be imported
    # when this file is executed directly.
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, os.path.join(base, "src"))
    from data_loader import load_all

    feature_names = ["price", "spice", "distance_km", "veg_friendly"]
    restaurants, _, _ = load_all(os.path.join(base, "data"))
    F = restaurants[feature_names].to_numpy(dtype=float)

    eigen_report(F, feature_names)

    C = covariance_matrix(F)
    E = top_k_eigenvectors(C, k=2)
    print(f"\nTop-2 eigenvectors shape: {E.shape}")

    v = np.array([3.0, 4.0, 2.0, 1.0])  # example user preference vector
    v_proj = project_onto_top_k(v, E)
    print(f"User pref vector: {v}")
    print(f"Projected onto k=2 subspace: {np.round(v_proj, 4)}")
Loading