From 75d383f2f1e452a6732ebaabd74f8fd897341b0b Mon Sep 17 00:00:00 2001
From: NeurArk <guillaume.rospape@neurark.com>
Date: Thu, 22 May 2025 16:59:50 +0200
Subject: [PATCH] Add caching for model training and transformations

---
 tests/test_model.py     |  9 ++++++++
 tests/test_transform.py | 25 ++++++++++++++++++++
 utils/components.py     | 51 +++++++++++++++++++++++++++++------------
 utils/model.py          |  8 +++++++
 4 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index ad2760e..32f7edc 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -66,6 +66,15 @@ def custom_train(x, y):
     assert first is second
 
 
+def test_training_function_caching():
+    """Training twice on identical inputs must yield the very same model object."""
+    frame = sample_reg_df()
+    features, target = frame.drop(columns=["target"]), frame["target"]
+    trained_once = model.train_linear_regression(features, target)
+    trained_again = model.train_linear_regression(features, target)
+    assert trained_once is trained_again
+
+
 def test_regression_training_functions():
     df = sample_reg_df()
     X_train, X_test, y_train, y_test = model.train_test_split_data(df, "target")
diff --git a/tests/test_transform.py b/tests/test_transform.py
index 1f13fbe..73918de 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -67,3 +67,28 @@ def test_scale_features_non_numeric():
     df = sample_df().fillna({"cat": "b"})
     with pytest.raises(TypeError):
         transform.scale_features(df, ["cat"], method="standard")
+
+
+def test_cached_transformations_identical():
+    """Check that repeated identical calls are served from the cache.
+
+    The second call must return the very object produced by the first.
+    """
+    import streamlit as st
+    from utils import components
+
+    frame = sample_df()
+    # Reset Streamlit session state before exercising the cache.
+    st.session_state.clear()
+    call_args = (
+        frame,
+        "Fill Mean",
+        (),
+        "One-Hot",
+        (),
+        "Standard",
+    )
+    # Both calls receive the same frame object and the same options.
+    first_result = components._cached_transformations(*call_args)
+    second_result = components._cached_transformations(*call_args)
+    assert first_result is second_result
diff --git a/utils/components.py b/utils/components.py
index d090871..e68167c 100644
--- a/utils/components.py
+++ b/utils/components.py
@@ -182,6 +182,34 @@ def visualization_section(data: pd.DataFrame) -> None:
         st.write(f"- {insight}")
 
 
+@st.cache_resource
+def _cached_transformations(
+    df: pd.DataFrame,
+    missing_strategy: str,
+    encode_cols: tuple[str, ...],
+    encode_method: str,
+    scale_cols: tuple[str, ...],
+    scale_method: str,
+) -> pd.DataFrame:
+    """Apply missing-value handling, encoding and scaling to a copy of ``df``."""
+    # st.cache_resource returns the stored object on a hit; do not mutate the result.
+    # Accepted ``missing_strategy`` labels; any other label skips that step.
+    fill_strategies = {
+        "Drop rows": "drop", "Fill Mean": "mean", "Fill Median": "median", "Fill Mode": "mode"
+    }
+    result = df.copy()
+    strategy = fill_strategies.get(missing_strategy)
+    if strategy is not None:
+        result = transform.handle_missing_values(result, strategy=strategy)
+    if encode_cols:
+        encoder = "onehot" if encode_method == "One-Hot" else "label"
+        result = transform.encode_features(result, list(encode_cols), method=encoder)
+    if scale_cols:
+        scaler = "standard" if scale_method == "Standard" else "minmax"
+        result = transform.scale_features(result, list(scale_cols), method=scaler)
+    return result
+
+
 def transformation_section(data: pd.DataFrame) -> pd.DataFrame:
     """Provide UI to apply common data transformations."""
     st.subheader("Data Transformation")
@@ -207,22 +235,15 @@ def transformation_section(data: pd.DataFrame) -> pd.DataFrame:
             ["Standard", "Min-Max"],
         )
     if st.button("Apply Transformations"):
-        df_trans = data.copy()
         try:
-            if missing_strategy == "Drop rows":
-                df_trans = transform.handle_missing_values(df_trans, strategy="drop")
-            elif missing_strategy == "Fill Mean":
-                df_trans = transform.handle_missing_values(df_trans, strategy="mean")
-            elif missing_strategy == "Fill Median":
-                df_trans = transform.handle_missing_values(df_trans, strategy="median")
-            elif missing_strategy == "Fill Mode":
-                df_trans = transform.handle_missing_values(df_trans, strategy="mode")
-            if encode_cols:
-                method = "onehot" if encode_method == "One-Hot" else "label"
-                df_trans = transform.encode_features(df_trans, encode_cols, method=method)
-            if scale_cols:
-                method = "standard" if scale_method == "Standard" else "minmax"
-                df_trans = transform.scale_features(df_trans, scale_cols, method=method)
+            df_trans = _cached_transformations(
+                data,
+                missing_strategy,
+                tuple(encode_cols),
+                encode_method,
+                tuple(scale_cols),
+                scale_method,
+            )
         except (ValueError, KeyError, TypeError) as exc:
             st.error(f"Transformation error: {exc}")
         else:
diff --git a/utils/model.py b/utils/model.py
index 73a07fe..2a48d32 100644
--- a/utils/model.py
+++ b/utils/model.py
@@ -8,6 +8,7 @@
 
 import pandas as pd
 import numpy as np
+import streamlit as st
 from joblib import dump
 from sklearn.model_selection import train_test_split, cross_val_score
 from sklearn.linear_model import LogisticRegression, LinearRegression
@@ -66,6 +67,7 @@ def train_test_split_data(
     )
 
 
+@st.cache_resource
 @cache_model
 def train_logistic_regression(
     X: pd.DataFrame,
@@ -80,6 +82,7 @@ def train_logistic_regression(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_random_forest_classifier(
     X: pd.DataFrame,
@@ -99,6 +102,7 @@ def train_random_forest_classifier(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_xgboost_classifier(
     X: pd.DataFrame,
@@ -143,6 +147,7 @@ def detect_problem_type(y: pd.Series) -> str:
     return "classification"
 
 
+@st.cache_resource
 @cache_model
 def train_linear_regression(
     X: pd.DataFrame,
@@ -154,6 +159,7 @@ def train_linear_regression(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_decision_tree_regressor(
     X: pd.DataFrame,
@@ -168,6 +174,7 @@ def train_decision_tree_regressor(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_random_forest_regressor(
     X: pd.DataFrame,
@@ -187,6 +194,7 @@ def train_random_forest_regressor(
     return model
 
 
+@st.cache_resource
 @cache_model
 def train_xgboost_regressor(
     X: pd.DataFrame,