From 75d383f2f1e452a6732ebaabd74f8fd897341b0b Mon Sep 17 00:00:00 2001
From: NeurArk
Date: Thu, 22 May 2025 16:59:50 +0200
Subject: [PATCH] Add caching for model training and transformations

Cache the transformation pipeline with st.cache_data so that every call
gets its own copy of the transformed DataFrame, and decorate the model
training functions with st.cache_resource so that repeated calls with the
same inputs return the same model object.
---
NOTE(review): st.cache_resource is stacked on top of @cache_model, whose
implementation is not part of this patch. Confirm that the wrapper it
returns preserves each function's identity (e.g. via functools.wraps);
otherwise the cache entries of different training functions may collide.

 tests/test_model.py     |  9 ++++++++
 tests/test_transform.py | 25 ++++++++++++++++++++
 utils/components.py     | 51 +++++++++++++++++++++++++++++------------
 utils/model.py          |  8 +++++++
 4 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index ad2760e..32f7edc 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -66,6 +66,15 @@ def custom_train(x, y):
     assert first is second
 
 
+def test_training_function_caching():
+    df = sample_reg_df()
+    X = df.drop(columns=["target"])
+    y = df["target"]
+    first = model.train_linear_regression(X, y)
+    second = model.train_linear_regression(X, y)
+    assert first is second
+
+
 def test_regression_training_functions():
     df = sample_reg_df()
     X_train, X_test, y_train, y_test = model.train_test_split_data(df, "target")
diff --git a/tests/test_transform.py b/tests/test_transform.py
index 1f13fbe..73918de 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -67,3 +67,28 @@ def test_scale_features_non_numeric():
     df = sample_df().fillna({"cat": "b"})
     with pytest.raises(TypeError):
         transform.scale_features(df, ["cat"], method="standard")
+
+
+def test_cached_transformations_identical():
+    import streamlit as st
+    from utils import components
+
+    df = sample_df()
+    st.session_state.clear()
+    first = components._cached_transformations(
+        df,
+        "Fill Mean",
+        tuple(),
+        "One-Hot",
+        tuple(),
+        "Standard",
+    )
+    second = components._cached_transformations(
+        df,
+        "Fill Mean",
+        tuple(),
+        "One-Hot",
+        tuple(),
+        "Standard",
+    )
+    assert first.equals(second)
diff --git a/utils/components.py b/utils/components.py
index d090871..e68167c 100644
--- a/utils/components.py
+++ b/utils/components.py
@@ -182,6 +182,34 @@ def visualization_section(data: pd.DataFrame) -> None:
             st.write(f"- {insight}")
 
 
+@st.cache_data
+def _cached_transformations(
+    df: pd.DataFrame,
+    missing_strategy: str,
+    encode_cols: tuple[str, ...],
+    encode_method: str,
+    scale_cols: tuple[str, ...],
+    scale_method: str,
+) -> pd.DataFrame:
+    """Return a transformed copy of df, cached per unique argument set."""
+    df_trans = df.copy()
+    if missing_strategy == "Drop rows":
+        df_trans = transform.handle_missing_values(df_trans, strategy="drop")
+    elif missing_strategy == "Fill Mean":
+        df_trans = transform.handle_missing_values(df_trans, strategy="mean")
+    elif missing_strategy == "Fill Median":
+        df_trans = transform.handle_missing_values(df_trans, strategy="median")
+    elif missing_strategy == "Fill Mode":
+        df_trans = transform.handle_missing_values(df_trans, strategy="mode")
+    if encode_cols:
+        method = "onehot" if encode_method == "One-Hot" else "label"
+        df_trans = transform.encode_features(df_trans, list(encode_cols), method=method)
+    if scale_cols:
+        method = "standard" if scale_method == "Standard" else "minmax"
+        df_trans = transform.scale_features(df_trans, list(scale_cols), method=method)
+    return df_trans
+
+
 def transformation_section(data: pd.DataFrame) -> pd.DataFrame:
     """Provide UI to apply common data transformations."""
     st.subheader("Data Transformation")
@@ -207,22 +235,15 @@ def transformation_section(data: pd.DataFrame) -> pd.DataFrame:
         ["Standard", "Min-Max"],
     )
     if st.button("Apply Transformations"):
-        df_trans = data.copy()
         try:
-            if missing_strategy == "Drop rows":
-                df_trans = transform.handle_missing_values(df_trans, strategy="drop")
-            elif missing_strategy == "Fill Mean":
-                df_trans = transform.handle_missing_values(df_trans, strategy="mean")
-            elif missing_strategy == "Fill Median":
-                df_trans = transform.handle_missing_values(df_trans, strategy="median")
-            elif missing_strategy == "Fill Mode":
-                df_trans = transform.handle_missing_values(df_trans, strategy="mode")
-            if encode_cols:
-                method = "onehot" if encode_method == "One-Hot" else "label"
-                df_trans = transform.encode_features(df_trans, encode_cols, method=method)
-            if scale_cols:
-                method = "standard" if scale_method == "Standard" else "minmax"
-                df_trans = transform.scale_features(df_trans, scale_cols, method=method)
+            df_trans = _cached_transformations(
+                data,
+                missing_strategy,
+                tuple(encode_cols),
+                encode_method,
+                tuple(scale_cols),
+                scale_method,
+            )
         except (ValueError, KeyError, TypeError) as exc:
             st.error(f"Transformation error: {exc}")
         else:
diff --git a/utils/model.py b/utils/model.py
index 73a07fe..2a48d32 100644
--- a/utils/model.py
+++ b/utils/model.py
@@ -8,6 +8,7 @@
 
 import pandas as pd
 import numpy as np
+import streamlit as st
 from joblib import dump
 from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.linear_model import LogisticRegression, LinearRegression
@@ -66,6 +67,7 @@ def train_test_split_data(
     )
 
 
+@st.cache_resource
 @cache_model
 def train_logistic_regression(
     X: pd.DataFrame,
@@ -80,6 +82,7 @@ def train_logistic_regression(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_random_forest_classifier(
     X: pd.DataFrame,
@@ -99,6 +102,7 @@ def train_random_forest_classifier(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_xgboost_classifier(
     X: pd.DataFrame,
@@ -143,6 +147,7 @@ def detect_problem_type(y: pd.Series) -> str:
     return "classification"
 
 
+@st.cache_resource
 @cache_model
 def train_linear_regression(
     X: pd.DataFrame,
@@ -154,6 +159,7 @@ def train_linear_regression(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_decision_tree_regressor(
     X: pd.DataFrame,
@@ -168,6 +174,7 @@ def train_decision_tree_regressor(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_random_forest_regressor(
     X: pd.DataFrame,
@@ -187,6 +194,7 @@ def train_random_forest_regressor(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_xgboost_regressor(
     X: pd.DataFrame,