From 7d23f871e322c14a26293282bbe20686a81563f7 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 22 Mar 2026 00:29:30 +0530 Subject: [PATCH 01/26] added test case `def test_raises_non_fitted_error_when_error_during_fit(estimator):` --- .../test_check_estimator_imputers.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_imputation/test_check_estimator_imputers.py b/tests/test_imputation/test_check_estimator_imputers.py index 0091c7bf7..6a563fef4 100644 --- a/tests/test_imputation/test_check_estimator_imputers.py +++ b/tests/test_imputation/test_check_estimator_imputers.py @@ -1,3 +1,7 @@ +from numpy import nan +from sklearn import clone +from sklearn.exceptions import NotFittedError + import pandas as pd import pytest import sklearn @@ -69,3 +73,29 @@ def test_transformers_in_pipeline_with_set_output_pandas(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize("estimator", _estimators) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + if estimator.__class__.__name__ in [ + "MeanMedianImputer", + "EndTailImputer", + "ArbitraryNumberImputer", + ]: + X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) + + elif estimator.__class__.__name__ == "CategoricalImputer": + estimator.set_params(ignore_format=False) + X = pd.DataFrame({"num1": [1.0, 2.0, 3.0, 4.0, 5.0]}) + + else: + X = pd.DataFrame() + + with pytest.raises((ValueError, TypeError)): + estimator.fit(X) + + with pytest.raises(NotFittedError): + estimator.transform(X) + \ No newline at end of file From e1439cbd2d124e647517c4a74a0a9f71cd213a74 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 22 Mar 2026 01:03:17 +0530 Subject: [PATCH 02/26] fixing test case test_style --- tests/test_imputation/test_check_estimator_imputers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_imputation/test_check_estimator_imputers.py b/tests/test_imputation/test_check_estimator_imputers.py index 6a563fef4..95a312de7 100644 --- a/tests/test_imputation/test_check_estimator_imputers.py +++ b/tests/test_imputation/test_check_estimator_imputers.py @@ -1,4 +1,3 @@ -from numpy import nan from sklearn import clone from sklearn.exceptions import NotFittedError @@ -97,5 +96,4 @@ def test_raises_non_fitted_error_when_error_during_fit(estimator): estimator.fit(X) with pytest.raises(NotFittedError): - estimator.transform(X) - \ No newline at end of file + estimator.transform(X) \ No newline at end of file From f33a85addd64fd700bdcef5ace07b60d5905b998 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 22 Mar 2026 01:06:11 +0530 Subject: [PATCH 03/26] fixing test case test_style --- tests/test_imputation/test_check_estimator_imputers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_imputation/test_check_estimator_imputers.py b/tests/test_imputation/test_check_estimator_imputers.py index 95a312de7..9d2ebb183 100644 --- a/tests/test_imputation/test_check_estimator_imputers.py +++ b/tests/test_imputation/test_check_estimator_imputers.py @@ -96,4 +96,5 @@ def test_raises_non_fitted_error_when_error_during_fit(estimator): estimator.fit(X) with pytest.raises(NotFittedError): - estimator.transform(X) \ No newline at end of file + estimator.transform(X) + \ No newline at end of file From 7a0e8f5f6fe19dc79d67536b4a7fe9977beec0a5 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 22 Mar 2026 01:07:17 +0530 Subject: [PATCH 04/26] fixing test case test_style --- tests/test_imputation/test_check_estimator_imputers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_imputation/test_check_estimator_imputers.py b/tests/test_imputation/test_check_estimator_imputers.py index 9d2ebb183..e9017309a 100644 --- a/tests/test_imputation/test_check_estimator_imputers.py +++ b/tests/test_imputation/test_check_estimator_imputers.py @@ -97,4 +97,3 @@ def test_raises_non_fitted_error_when_error_during_fit(estimator): with pytest.raises(NotFittedError): estimator.transform(X) - \ No newline at end of file From 59c8bd1168265892d2e42990c8adf862857949c0 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 22 Mar 2026 15:25:18 +0530 Subject: [PATCH 05/26] fix: defer trailing underscore attribute assignment in fit() for imputers and discretisers (closes #586) --- .../_base_transformers/base_numerical.py | 33 ++++++++++++------- .../discretisation/equal_frequency.py | 12 ++++--- feature_engine/discretisation/equal_width.py | 12 ++++--- feature_engine/imputation/arbitrary_number.py | 12 ++++--- feature_engine/imputation/end_tail.py | 26 ++++++++------- feature_engine/imputation/mean_median.py | 10 +++--- 6 files changed, 65 insertions(+), 40 deletions(-) diff --git a/feature_engine/_base_transformers/base_numerical.py b/feature_engine/_base_transformers/base_numerical.py index 60212f3d6..02507a226 100644 --- a/feature_engine/_base_transformers/base_numerical.py +++ b/feature_engine/_base_transformers/base_numerical.py @@ -28,6 +28,26 @@ class BaseNumericalTransformer( variable transformers, discretisers, math combination. """ + def _fit_setup(self, X: pd.DataFrame): + """ + Check dataframe, find numerical variables, check for NA and Inf. + Returns the checked dataframe and the correctly identified numerical variables. + """ + # check input dataframe + X = check_X(X) + + # find or check for numerical variables + if self.variables is None: + variables_ = find_numerical_variables(X) + else: + variables_ = check_numerical_variables(X, self.variables) + + # check if dataset contains na or inf + _check_contains_na(X, variables_) + _check_contains_inf(X, variables_) + + return X, variables_ + def fit(self, X: pd.DataFrame) -> pd.DataFrame: """ Checks that input is a dataframe, finds numerical variables, or alternatively @@ -55,18 +75,9 @@ def fit(self, X: pd.DataFrame) -> pd.DataFrame: The same dataframe entered as parameter """ - # check input dataframe - X = check_X(X) - - # find or check for numerical variables - if self.variables is None: - self.variables_ = find_numerical_variables(X) - else: - self.variables_ = check_numerical_variables(X, self.variables) + X, variables_ = self._fit_setup(X) - # check if dataset contains na or inf - _check_contains_na(X, self.variables_) - _check_contains_inf(X, self.variables_) + self.variables_ = variables_ # save input features self.feature_names_in_ = X.columns.tolist() diff --git a/feature_engine/discretisation/equal_frequency.py b/feature_engine/discretisation/equal_frequency.py index 9060f1d49..caaa0cfcc 100644 --- a/feature_engine/discretisation/equal_frequency.py +++ b/feature_engine/discretisation/equal_frequency.py @@ -159,17 +159,21 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) - self.binner_dict_ = {} + binner_dict_ = {} - for var in self.variables_: + for var in variables_: tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates="drop") # Prepend/Append infinities to accommodate outliers bins = list(bins) bins[0] = float("-inf") bins[len(bins) - 1] = float("inf") - self.binner_dict_[var] = bins + binner_dict_[var] = bins + self.binner_dict_ = binner_dict_ + self.variables_ = variables_ + self.feature_names_in_ = X.columns.tolist() + self.n_features_in_ = X.shape[1] return self diff --git a/feature_engine/discretisation/equal_width.py b/feature_engine/discretisation/equal_width.py index 03787835d..d26889545 100644 --- a/feature_engine/discretisation/equal_width.py +++ b/feature_engine/discretisation/equal_width.py @@ -168,12 +168,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) # fit - self.binner_dict_ = {} + binner_dict_ = {} - for var in self.variables_: + for var in variables_: tmp, bins = pd.cut( x=X[var], bins=self.bins, @@ -186,6 +186,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): bins = list(bins) bins[0] = float("-inf") bins[len(bins) - 1] = float("inf") - self.binner_dict_[var] = bins + binner_dict_[var] = bins + self.binner_dict_ = binner_dict_ + self.variables_ = variables_ + self.feature_names_in_ = X.columns.tolist() + self.n_features_in_ = X.shape[1] return self diff --git a/feature_engine/imputation/arbitrary_number.py b/feature_engine/imputation/arbitrary_number.py index 668f391b0..e16c62ae2 100644 --- a/feature_engine/imputation/arbitrary_number.py +++ b/feature_engine/imputation/arbitrary_number.py @@ -149,17 +149,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find or check for numerical variables # create the imputer dictionary if self.imputer_dict: - self.variables_ = check_numerical_variables( + variables_ = check_numerical_variables( X, list(self.imputer_dict.keys()) ) - self.imputer_dict_ = self.imputer_dict + imputer_dict_ = self.imputer_dict else: if self.variables is None: - self.variables_ = find_numerical_variables(X) + variables_ = find_numerical_variables(X) else: - self.variables_ = check_numerical_variables(X, self.variables) - self.imputer_dict_ = {var: self.arbitrary_number for var in self.variables_} + variables_ = check_numerical_variables(X, self.variables) + imputer_dict_ = {var: self.arbitrary_number for var in variables_} + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/end_tail.py b/feature_engine/imputation/end_tail.py index 59e59f32a..a5ffda663 100644 --- a/feature_engine/imputation/end_tail.py +++ b/feature_engine/imputation/end_tail.py @@ -177,35 +177,37 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find or check for numerical variables if self.variables is None: - self.variables_ = find_numerical_variables(X) + variables_ = find_numerical_variables(X) else: - self.variables_ = check_numerical_variables(X, self.variables) + variables_ = check_numerical_variables(X, self.variables) # estimate imputation values if self.imputation_method == "max": - self.imputer_dict_ = (X[self.variables_].max() * self.fold).to_dict() + imputer_dict_ = (X[variables_].max() * self.fold).to_dict() elif self.imputation_method == "gaussian": if self.tail == "right": - self.imputer_dict_ = ( - X[self.variables_].mean() + self.fold * X[self.variables_].std() + imputer_dict_ = ( + X[variables_].mean() + self.fold * X[variables_].std() ).to_dict() elif self.tail == "left": - self.imputer_dict_ = ( - X[self.variables_].mean() - self.fold * X[self.variables_].std() + imputer_dict_ = ( + X[variables_].mean() - self.fold * X[variables_].std() ).to_dict() elif self.imputation_method == "iqr": - IQR = X[self.variables_].quantile(0.75) - X[self.variables_].quantile(0.25) + IQR = X[variables_].quantile(0.75) - X[variables_].quantile(0.25) if self.tail == "right": - self.imputer_dict_ = ( - X[self.variables_].quantile(0.75) + (IQR * self.fold) + imputer_dict_ = ( + X[variables_].quantile(0.75) + (IQR * self.fold) ).to_dict() elif self.tail == "left": - self.imputer_dict_ = ( - X[self.variables_].quantile(0.25) - (IQR * self.fold) + imputer_dict_ = ( + X[variables_].quantile(0.25) - (IQR * self.fold) ).to_dict() + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/mean_median.py b/feature_engine/imputation/mean_median.py index da845e063..3f3baee0a 100644 --- a/feature_engine/imputation/mean_median.py +++ b/feature_engine/imputation/mean_median.py @@ -127,17 +127,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find or check for numerical variables if self.variables is None: - self.variables_ = find_numerical_variables(X) + variables_ = find_numerical_variables(X) else: - self.variables_ = check_numerical_variables(X, self.variables) + variables_ = check_numerical_variables(X, self.variables) # find imputation parameters: mean or median if self.imputation_method == "mean": - self.imputer_dict_ = X[self.variables_].mean().to_dict() + imputer_dict_ = X[variables_].mean().to_dict() elif self.imputation_method == "median": - self.imputer_dict_ = X[self.variables_].median().to_dict() + imputer_dict_ = X[variables_].median().to_dict() + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self From fef5a3f2a4bdeb055034da4e76bae50d4290ac90 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:39:09 +0530 Subject: [PATCH 06/26] base transformers --- .../_base_transformers/base_numerical.py | 40 +++---------------- feature_engine/_base_transformers/mixins.py | 13 +++--- 2 files changed, 10 insertions(+), 43 deletions(-) diff --git a/feature_engine/_base_transformers/base_numerical.py b/feature_engine/_base_transformers/base_numerical.py index 02507a226..d03cf9065 100644 --- a/feature_engine/_base_transformers/base_numerical.py +++ b/feature_engine/_base_transformers/base_numerical.py @@ -48,44 +48,14 @@ def _fit_setup(self, X: pd.DataFrame): return X, variables_ - def fit(self, X: pd.DataFrame) -> pd.DataFrame: - """ - Checks that input is a dataframe, finds numerical variables, or alternatively - checks that variables entered by the user are of type numerical. - - Parameters - ---------- - X : Pandas DataFrame - - y : Pandas Series, np.array. Default = None - Parameter is necessary for compatibility with sklearn Pipeline. - - Raises - ------ - TypeError - If the input is not a Pandas DataFrame or a numpy array - If any of the user provided variables are not numerical - ValueError - If there are no numerical variables in the df or the df is empty - If the variable(s) contain null values - - Returns - ------- - X : Pandas DataFrame - The same dataframe entered as parameter - """ - - X, variables_ = self._fit_setup(X) - - self.variables_ = variables_ + def _get_feature_names_in(self, X): + """Get the names and number of features in the train set (the dataframe + used during fit).""" - # save input features - self.feature_names_in_ = X.columns.tolist() - - # save train set shape + self.feature_names_in_ = X.columns.to_list() self.n_features_in_ = X.shape[1] - return X + return self def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame: """ diff --git a/feature_engine/_base_transformers/mixins.py b/feature_engine/_base_transformers/mixins.py index 4d4b7d254..cc6d57b6d 100644 --- a/feature_engine/_base_transformers/mixins.py +++ b/feature_engine/_base_transformers/mixins.py @@ -77,17 +77,14 @@ def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame: # find or check for numerical variables variables = list(user_dict_.keys()) - self.variables_ = check_numerical_variables(X, variables) + variables_ = check_numerical_variables(X, variables) # check if dataset contains na or inf - _check_contains_na(X, self.variables_) - _check_contains_inf(X, self.variables_) + _check_contains_na(X, variables_) + _check_contains_inf(X, variables_) - # save input features - self.feature_names_in_ = X.columns.tolist() - - # save train set shape - self.n_features_in_ = X.shape[1] + self.variables_ = variables_ + self._get_feature_names_in(X) return X From 2d3f7341a7b4a9eed6892d88140d3e740f728dde Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:40:00 +0530 Subject: [PATCH 07/26] discretisation --- feature_engine/discretisation/arbitrary.py | 1 + feature_engine/discretisation/decision_tree.py | 9 ++++++--- feature_engine/discretisation/equal_frequency.py | 3 +-- feature_engine/discretisation/equal_width.py | 3 +-- feature_engine/discretisation/geometric_width.py | 12 ++++++++---- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/feature_engine/discretisation/arbitrary.py b/feature_engine/discretisation/arbitrary.py index 44d35ecdf..d7442670f 100644 --- a/feature_engine/discretisation/arbitrary.py +++ b/feature_engine/discretisation/arbitrary.py @@ -155,6 +155,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # for consistency wit the rest of the discretisers, we add this attribute self.binner_dict_ = self.binning_dict + self._get_feature_names_in(X) return self diff --git a/feature_engine/discretisation/decision_tree.py b/feature_engine/discretisation/decision_tree.py index af691e4aa..b75cec5e1 100644 --- a/feature_engine/discretisation/decision_tree.py +++ b/feature_engine/discretisation/decision_tree.py @@ -241,7 +241,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore check_classification_targets(y) # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) if self.param_grid: param_grid = self.param_grid @@ -251,7 +251,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore binner_dict_ = {} scores_dict_ = {} - for var in self.variables_: + for var in variables_: if self.regression: model = DecisionTreeRegressor(random_state=self.random_state) @@ -269,7 +269,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore scores_dict_[var] = tree_model.score(X[var].to_frame(), y) if self.bin_output != "prediction": - for var in self.variables_: + for var in variables_: clf = binner_dict_[var].best_estimator_ threshold = clf.tree_.threshold feature = clf.tree_.feature @@ -280,6 +280,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore self.binner_dict_ = binner_dict_ self.scores_dict_ = scores_dict_ + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/discretisation/equal_frequency.py b/feature_engine/discretisation/equal_frequency.py index caaa0cfcc..841f3edb3 100644 --- a/feature_engine/discretisation/equal_frequency.py +++ b/feature_engine/discretisation/equal_frequency.py @@ -174,6 +174,5 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.binner_dict_ = binner_dict_ self.variables_ = variables_ - self.feature_names_in_ = X.columns.tolist() - self.n_features_in_ = X.shape[1] + self._get_feature_names_in(X) return self diff --git a/feature_engine/discretisation/equal_width.py b/feature_engine/discretisation/equal_width.py index d26889545..0bfd2b4bc 100644 --- a/feature_engine/discretisation/equal_width.py +++ b/feature_engine/discretisation/equal_width.py @@ -190,6 +190,5 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.binner_dict_ = binner_dict_ self.variables_ = variables_ - self.feature_names_in_ = X.columns.tolist() - self.n_features_in_ = X.shape[1] + self._get_feature_names_in(X) return self diff --git a/feature_engine/discretisation/geometric_width.py b/feature_engine/discretisation/geometric_width.py index 9f7c37d21..7da28261a 100644 --- a/feature_engine/discretisation/geometric_width.py +++ b/feature_engine/discretisation/geometric_width.py @@ -159,12 +159,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) # fit - self.binner_dict_ = {} + binner_dict_ = {} - for var in self.variables_: + for var in variables_: min_, max_ = X[var].min(), X[var].max() increment = np.power(max_ - min_, 1.0 / self.bins) bins = np.r_[ @@ -172,6 +172,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): ] bins = np.sort(bins) bins = list(bins) - self.binner_dict_[var] = bins + binner_dict_[var] = bins + + self.variables_ = variables_ + self.binner_dict_ = binner_dict_ + self._get_feature_names_in(X) return self From f95cb7fddfac06a83ff92bd711a85023bb341794 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:40:19 +0530 Subject: [PATCH 08/26] scaling --- feature_engine/scaling/mean_normalization.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/feature_engine/scaling/mean_normalization.py b/feature_engine/scaling/mean_normalization.py index 78f4a958c..93c9ae171 100644 --- a/feature_engine/scaling/mean_normalization.py +++ b/feature_engine/scaling/mean_normalization.py @@ -120,18 +120,24 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) - self.mean_ = X[self.variables_].mean().to_dict() - self.range_ = (X[self.variables_].max() - X[self.variables_].min()).to_dict() + X, variables_ = self._fit_setup(X) + + mean_ = X[variables_].mean().to_dict() + range_ = (X[variables_].max() - X[variables_].min()).to_dict() # check for constant columns - constant_columns = [col for col, value in self.range_.items() if value == 0] + constant_columns = [col for col, value in range_.items() if value == 0] if constant_columns: raise ValueError( f"The following variable(s) are constant: {constant_columns}. " "Division by zero is not allowed. Please remove constant columns." ) + self.variables_ = variables_ + self.mean_ = mean_ + self.range_ = range_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: From 9b2fa4c7591ae9d2cf8b31d73d04476ee37aef10 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:40:50 +0530 Subject: [PATCH 09/26] creation --- feature_engine/creation/cyclical_features.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/feature_engine/creation/cyclical_features.py b/feature_engine/creation/cyclical_features.py index 40e96cab7..e33ded56a 100644 --- a/feature_engine/creation/cyclical_features.py +++ b/feature_engine/creation/cyclical_features.py @@ -147,11 +147,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): It is not needed in this transformer. You can pass y or None. """ if self.max_values is None: - X = super().fit(X) - self.max_values_ = X[self.variables_].max().to_dict() + X, variables_ = self._fit_setup(X) + max_values_ = X[variables_].max().to_dict() else: - super()._fit_from_dict(X, self.max_values) - self.max_values_ = self.max_values + X = super()._fit_from_dict(X, self.max_values) + variables_ = self.variables_ + max_values_ = self.max_values + + self.variables_ = variables_ + self.max_values_ = max_values_ + self._get_feature_names_in(X) return self From 64c9e741c616a98041e95e235a35ef69a9597362 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:41:10 +0530 Subject: [PATCH 10/26] imputation --- feature_engine/imputation/categorical.py | 22 ++++++++++-------- .../imputation/drop_missing_data.py | 9 ++++---- .../imputation/missing_indicator.py | 9 ++++---- feature_engine/imputation/random_sample.py | 23 +++++++++++-------- 4 files changed, 35 insertions(+), 28 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 8c4000a0c..f329c3b44 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -169,22 +169,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # select variables to encode if self.ignore_format is True: if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) else: if self.variables is None: - self.variables_ = find_categorical_variables(X) + variables_ = find_categorical_variables(X) else: - self.variables_ = check_categorical_variables(X, self.variables) + variables_ = check_categorical_variables(X, self.variables) if self.imputation_method == "missing": - self.imputer_dict_ = {var: self.fill_value for var in self.variables_} + imputer_dict_ = {var: self.fill_value for var in variables_} elif self.imputation_method == "frequent": # if imputing only 1 variable: - if len(self.variables_) == 1: - var = self.variables_[0] + if len(variables_) == 1: + var = variables_[0] mode_vals = X[var].mode() # Some variables may contain more than 1 mode: @@ -193,13 +193,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): f"The variable {var} contains multiple frequent categories." ) - self.imputer_dict_ = {var: mode_vals[0]} + imputer_dict_ = {var: mode_vals[0]} # imputing multiple variables: else: # Returns a dataframe with 1 row if there is one mode per # variable, or more rows if there are more modes: - mode_vals = X[self.variables_].mode() + mode_vals = X[variables_].mode() # Careful: some variables contain multiple modes if len(mode_vals) > 1: @@ -213,8 +213,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): f"categories." ) - self.imputer_dict_ = mode_vals.iloc[0].to_dict() + imputer_dict_ = mode_vals.iloc[0].to_dict() + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/drop_missing_data.py b/feature_engine/imputation/drop_missing_data.py index 07c6f3e75..12de75478 100644 --- a/feature_engine/imputation/drop_missing_data.py +++ b/feature_engine/imputation/drop_missing_data.py @@ -150,16 +150,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find variables for which indicator should be added if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) # If user passes a threshold, then missing_only is ignored: if self.threshold is None and self.missing_only is True: - self.variables_ = [ - var for var in self.variables_ if X[var].isnull().sum() > 0 + variables_ = [ + var for var in variables_ if X[var].isnull().sum() > 0 ] + self.variables_ = variables_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/missing_indicator.py b/feature_engine/imputation/missing_indicator.py index 01660a654..a902230c9 100644 --- a/feature_engine/imputation/missing_indicator.py +++ b/feature_engine/imputation/missing_indicator.py @@ -129,15 +129,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find variables for which indicator should be added if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) if self.missing_only is True: - self.variables_ = [ - var for var in self.variables_ if X[var].isnull().sum() > 0 + variables_ = [ + var for var in variables_ if X[var].isnull().sum() > 0 ] + self.variables_ = variables_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/random_sample.py b/feature_engine/imputation/random_sample.py index d05aeaac8..f319242e5 100644 --- a/feature_engine/imputation/random_sample.py +++ b/feature_engine/imputation/random_sample.py @@ -184,26 +184,29 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find variables to impute if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) # take a copy of the selected variables - self.X_ = X[self.variables_].copy() + X_ = X[variables_].copy() # check the variables assigned to the random state if self.seed == "observation": - self.random_state = _check_variables_input_value(self.random_state) - if isinstance(self.random_state, (int, str)): - self.random_state = [self.random_state] - if self.random_state and any( - var for var in self.random_state if var not in X.columns + random_state = _check_variables_input_value(self.random_state) + if isinstance(random_state, (int, str)): + random_state = [random_state] + if random_state and any( + var for var in random_state if var not in X.columns ): raise ValueError( - "There are variables assigned as random state which are not part " - "of the training dataframe." + "One or more of the variables indicated in random_state " + "is not present in the dataframe." ) + self.random_state = random_state + self.variables_ = variables_ + self.X_ = X_ self._get_feature_names_in(X) return self From 50075fb0edcc03a7ba8b2f1a9782995596b90c63 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:41:36 +0530 Subject: [PATCH 11/26] transformation --- feature_engine/transformation/arcsin.py | 4 +++- feature_engine/transformation/arcsinh.py | 4 +++- feature_engine/transformation/boxcox.py | 12 ++++++++---- feature_engine/transformation/log.py | 8 ++++++-- feature_engine/transformation/power.py | 4 +++- feature_engine/transformation/reciprocal.py | 4 +++- feature_engine/transformation/yeojohnson.py | 12 ++++++++---- 7 files changed, 34 insertions(+), 14 deletions(-) diff --git a/feature_engine/transformation/arcsin.py b/feature_engine/transformation/arcsin.py index 059df813e..acf2b2670 100644 --- a/feature_engine/transformation/arcsin.py +++ b/feature_engine/transformation/arcsin.py @@ -121,7 +121,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) # check if the variables are in the correct range if ((X[self.variables_] < 0) | (X[self.variables_] > 1)).any().any(): diff --git a/feature_engine/transformation/arcsinh.py b/feature_engine/transformation/arcsinh.py index e0020ff86..70e627a23 100644 --- a/feature_engine/transformation/arcsinh.py +++ b/feature_engine/transformation/arcsinh.py @@ -161,7 +161,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe and find/check numerical variables - X = super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/transformation/boxcox.py b/feature_engine/transformation/boxcox.py index 1541ff8b5..b1e817b7c 100644 --- a/feature_engine/transformation/boxcox.py +++ b/feature_engine/transformation/boxcox.py @@ -135,12 +135,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) - self.lambda_dict_ = {} + lambda_dict_ = {} - for var in self.variables_: - _, self.lambda_dict_[var] = stats.boxcox(X[var]) + for var in variables_: + _, lambda_dict_[var] = stats.boxcox(X[var]) + + self.variables_ = variables_ + self.lambda_dict_ = lambda_dict_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/transformation/log.py b/feature_engine/transformation/log.py index 695243291..c512e2a6c 100644 --- a/feature_engine/transformation/log.py +++ b/feature_engine/transformation/log.py @@ -128,7 +128,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) # check contains zero or negative values if (X[self.variables_] <= 0).any().any(): @@ -358,7 +360,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if isinstance(self.C, dict): X = super()._fit_from_dict(X, self.C) else: - X = super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) self.C_ = self.C diff --git a/feature_engine/transformation/power.py b/feature_engine/transformation/power.py index ae10a16bf..12e737b64 100644 --- a/feature_engine/transformation/power.py +++ b/feature_engine/transformation/power.py @@ -121,7 +121,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/transformation/reciprocal.py b/feature_engine/transformation/reciprocal.py index d51557331..b00d72603 100644 --- a/feature_engine/transformation/reciprocal.py +++ b/feature_engine/transformation/reciprocal.py @@ -112,7 +112,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) # check if the variables contain the value 0 if (X[self.variables_] == 0).any().any(): diff --git a/feature_engine/transformation/yeojohnson.py b/feature_engine/transformation/yeojohnson.py index f8d938e4a..0103ceac7 100644 --- a/feature_engine/transformation/yeojohnson.py +++ b/feature_engine/transformation/yeojohnson.py @@ -128,12 +128,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) - self.lambda_dict_ = {} + lambda_dict_ = {} - for var in self.variables_: - _, self.lambda_dict_[var] = stats.yeojohnson(X[var]) + for var in variables_: + _, lambda_dict_[var] = stats.yeojohnson(X[var]) + + self.variables_ = variables_ + self.lambda_dict_ = lambda_dict_ + self._get_feature_names_in(X) return self From f2e944fc28875ffaf318d2db42e26cfe4bd41e59 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:41:56 +0530 Subject: [PATCH 12/26] tests --- .../test_base_numerical_transformer.py | 10 ++++++++-- .../test_check_estimator_discretisers.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/test_base_transformers/test_base_numerical_transformer.py b/tests/test_base_transformers/test_base_numerical_transformer.py index 1629ab67e..aa48aa8a7 100644 --- a/tests/test_base_transformers/test_base_numerical_transformer.py +++ b/tests/test_base_transformers/test_base_numerical_transformer.py @@ -7,8 +7,14 @@ class MockClass(BaseNumericalTransformer): - def __init__(self): - self.variables = None + def __init__(self, variables=None): + self.variables = variables + + def fit(self, X, y=None): + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) + return X def transform(self, X): return self._check_transform_input_and_state(X) diff --git a/tests/test_discretisation/test_check_estimator_discretisers.py b/tests/test_discretisation/test_check_estimator_discretisers.py index 87e175eac..d151c3080 100644 --- a/tests/test_discretisation/test_check_estimator_discretisers.py +++ b/tests/test_discretisation/test_check_estimator_discretisers.py @@ -1,3 +1,6 @@ +from sklearn import clone +from sklearn.exceptions import NotFittedError + import numpy as np import pandas as pd import pytest @@ -63,3 +66,17 @@ def test_transformers_within_pipeline(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize("estimator", _estimators) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) + y = pd.Series([0, 1, 0, 1, 0]) + + with pytest.raises((ValueError, TypeError, KeyError)): + estimator.fit(X, y) + + with pytest.raises(NotFittedError): + estimator.transform(X) From ca9241ea92bed879eb7d6f03795d3bcfcf5d3a5a Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:44:43 +0530 Subject: [PATCH 13/26] creation --- feature_engine/creation/cyclical_features.py | 3 ++- feature_engine/discretisation/decision_tree.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/feature_engine/creation/cyclical_features.py b/feature_engine/creation/cyclical_features.py index e33ded56a..cae0fc525 100644 --- a/feature_engine/creation/cyclical_features.py +++ b/feature_engine/creation/cyclical_features.py @@ -146,12 +146,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): y: pandas Series, default=None It is not needed in this transformer. You can pass y or None. """ + variables_: List[Union[str, int]] if self.max_values is None: X, variables_ = self._fit_setup(X) max_values_ = X[variables_].max().to_dict() else: X = super()._fit_from_dict(X, self.max_values) - variables_ = self.variables_ + variables_ = self.variables # type: ignore max_values_ = self.max_values self.variables_ = variables_ diff --git a/feature_engine/discretisation/decision_tree.py b/feature_engine/discretisation/decision_tree.py index b75cec5e1..648771a1e 100644 --- a/feature_engine/discretisation/decision_tree.py +++ b/feature_engine/discretisation/decision_tree.py @@ -214,7 +214,7 @@ def __init__( self.param_grid = param_grid self.random_state = random_state - def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore + def fit(self, X: pd.DataFrame, y: pd.Series): """ Fit one decision tree per variable to discretize with cross-validation and grid-search for hyperparameters. From d813e2d62c0d4efbfc08d7eb81c9893db0773b02 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:50:24 +0530 Subject: [PATCH 14/26] verified changes for checks --- feature_engine/_base_transformers/mixins.py | 14 ++++++++------ feature_engine/creation/cyclical_features.py | 4 +--- feature_engine/discretisation/arbitrary.py | 3 ++- feature_engine/transformation/log.py | 20 ++++++++++---------- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/feature_engine/_base_transformers/mixins.py b/feature_engine/_base_transformers/mixins.py index cc6d57b6d..004f3cef7 100644 --- a/feature_engine/_base_transformers/mixins.py +++ b/feature_engine/_base_transformers/mixins.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union +from typing import Dict, List, Tuple, Union import pandas as pd from numpy import ndarray @@ -46,7 +46,9 @@ def transform_x_y(self, X: pd.DataFrame, y: pd.Series): class FitFromDictMixin: - def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame: + def _fit_from_dict( + self, X: pd.DataFrame, user_dict_: Dict + ) -> Tuple[pd.DataFrame, List[Union[str, int]]]: """ Checks that input is a dataframe, checks that variables in the dictionary entered by the user are of type numerical. @@ -71,6 +73,9 @@ def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame: ------- X : Pandas DataFrame The same dataframe entered as parameter + + variables_ : List + The variables in the dictionary. """ # check input dataframe X = check_X(X) @@ -83,10 +88,7 @@ def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame: _check_contains_na(X, variables_) _check_contains_inf(X, variables_) - self.variables_ = variables_ - self._get_feature_names_in(X) - - return X + return X, variables_ class GetFeatureNamesOutMixin: diff --git a/feature_engine/creation/cyclical_features.py b/feature_engine/creation/cyclical_features.py index cae0fc525..849c71a3d 100644 --- a/feature_engine/creation/cyclical_features.py +++ b/feature_engine/creation/cyclical_features.py @@ -146,13 +146,11 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): y: pandas Series, default=None It is not needed in this transformer. You can pass y or None. """ - variables_: List[Union[str, int]] if self.max_values is None: X, variables_ = self._fit_setup(X) max_values_ = X[variables_].max().to_dict() else: - X = super()._fit_from_dict(X, self.max_values) - variables_ = self.variables # type: ignore + X, variables_ = super()._fit_from_dict(X, self.max_values) max_values_ = self.max_values self.variables_ = variables_ diff --git a/feature_engine/discretisation/arbitrary.py b/feature_engine/discretisation/arbitrary.py index d7442670f..6a3e9c468 100644 --- a/feature_engine/discretisation/arbitrary.py +++ b/feature_engine/discretisation/arbitrary.py @@ -151,9 +151,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): y is not needed in this transformer. You can pass y or None. """ # check input dataframe - X = super()._fit_from_dict(X, self.binning_dict) + X, variables_ = super()._fit_from_dict(X, self.binning_dict) # for consistency wit the rest of the discretisers, we add this attribute + self.variables_ = variables_ self.binner_dict_ = self.binning_dict self._get_feature_names_in(X) diff --git a/feature_engine/transformation/log.py b/feature_engine/transformation/log.py index c512e2a6c..f6fc1e1f5 100644 --- a/feature_engine/transformation/log.py +++ b/feature_engine/transformation/log.py @@ -358,25 +358,25 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check input dataframe if isinstance(self.C, dict): - X = super()._fit_from_dict(X, self.C) + X, variables_ = super()._fit_from_dict(X, self.C) else: X, variables_ = self._fit_setup(X) - self.variables_ = variables_ - self._get_feature_names_in(X) - - self.C_ = self.C # calculate C to add to each variable if self.C == "auto": # we add 0 to positive variables - c_dict = {var: 0 for var in self.variables_ if X[var].min() > 0} + c_dict = {var: 0 for var in variables_ if X[var].min() > 0} # we add the minimum plus 1 to non-positive variables - non_positive_vars = [ - var for var in self.variables_ if var not in c_dict.keys() - ] + non_positive_vars = [var for var in variables_ if var not in c_dict.keys()] c_dict.update(dict(X[non_positive_vars].min(axis=0).abs() + 1)) - self.C_ = c_dict # type:ignore + C_ = c_dict + else: + C_ = self.C + + self.variables_ = variables_ + self.C_ = C_ + self._get_feature_names_in(X) return self From 519ca1dd412894af73e17caf17851df687659cc6 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 02:53:41 +0530 Subject: [PATCH 15/26] verified changes for checks --- feature_engine/transformation/log.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/feature_engine/transformation/log.py b/feature_engine/transformation/log.py index f6fc1e1f5..71b685209 100644 --- a/feature_engine/transformation/log.py +++ b/feature_engine/transformation/log.py @@ -127,7 +127,6 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): It is not needed in this transformer. You can pass y or None. """ - # check input dataframe X, variables_ = self._fit_setup(X) self.variables_ = variables_ self._get_feature_names_in(X) @@ -363,13 +362,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): X, variables_ = self._fit_setup(X) # calculate C to add to each variable + C_: Union[int, float, Dict[Union[str, int], Union[float, int]]] if self.C == "auto": # we add 0 to positive variables - c_dict = {var: 0 for var in variables_ if X[var].min() > 0} + c_dict: Dict[Union[str, int], Union[float, int]] = { + var: 0.0 for var in variables_ if X[var].min() > 0 + } # we add the minimum plus 1 to non-positive variables non_positive_vars = [var for var in variables_ if var not in c_dict.keys()] - c_dict.update(dict(X[non_positive_vars].min(axis=0).abs() + 1)) + if non_positive_vars: + c_dict.update(dict(X[non_positive_vars].min(axis=0).abs() + 1)) C_ = c_dict else: C_ = self.C From 44f75c049900089a313547368415f3824d14843e Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 03:00:36 +0530 Subject: [PATCH 16/26] value error --- feature_engine/transformation/log.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/feature_engine/transformation/log.py b/feature_engine/transformation/log.py index 71b685209..9506c3a32 100644 --- a/feature_engine/transformation/log.py +++ b/feature_engine/transformation/log.py @@ -374,8 +374,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if non_positive_vars: c_dict.update(dict(X[non_positive_vars].min(axis=0).abs() + 1)) C_ = c_dict - else: + elif isinstance(self.C, (int, float, dict)): C_ = self.C + else: + raise ValueError( + f"C can take only 'auto', integers, floats or dicts. " + f"Got {self.C} instead." + ) self.variables_ = variables_ self.C_ = C_ From 7b3c5927c0f2e3b8d7d68b3f8911543f005a13d4 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:21:30 +0530 Subject: [PATCH 17/26] ADDED:`test_raises_non_fitted_error_when_error_during_fit` --- .../test_check_estimator_creation.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_creation/test_check_estimator_creation.py b/tests/test_creation/test_check_estimator_creation.py index e3c22caa1..781076f91 100644 --- a/tests/test_creation/test_check_estimator_creation.py +++ b/tests/test_creation/test_check_estimator_creation.py @@ -1,3 +1,6 @@ +from sklearn import clone +from sklearn.exceptions import NotFittedError + import pandas as pd import pytest import sklearn @@ -97,3 +100,24 @@ def test_geo_distance_transformer_in_pipeline(): Xtp = pipe.fit_transform(X.copy(), y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize("estimator", _estimators) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) + y = pd.Series([0, 1, 0, 1, 0]) + + # If variables are provided, we need to ensure they are in the dataframe + # or handle the KeyError. + if hasattr(estimator, "variables") and estimator.variables: + X = pd.DataFrame( + {var: ["a", "b", "c", "a", "b"] for var in estimator.variables} + ) + + with pytest.raises((ValueError, TypeError, KeyError)): + estimator.fit(X, y) + + with pytest.raises(NotFittedError): + estimator.transform(X) From 6babea2fdc541c8ada2a13bb841bdd7bc85fd392 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:23:54 +0530 Subject: [PATCH 18/26] added:`test_raises_non_fitted_error_when_error_during_fit` --- .../test_check_estimator_transformers.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_transformation/test_check_estimator_transformers.py b/tests/test_transformation/test_check_estimator_transformers.py index 8f482e10d..5ab17b7d6 100644 --- a/tests/test_transformation/test_check_estimator_transformers.py +++ b/tests/test_transformation/test_check_estimator_transformers.py @@ -1,3 +1,6 @@ +from sklearn import clone +from sklearn.exceptions import NotFittedError + import pandas as pd import pytest import sklearn @@ -95,3 +98,25 @@ def test_transformers_in_pipeline_with_set_output_pandas(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize("estimator", _estimators) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + if estimator.__class__.__name__ == "BoxCoxTransformer": + X = pd.DataFrame({"num1": [-1.0, 2.0, 3.0, 4.0, 5.0]}) + elif estimator.__class__.__name__ == "ArcsinTransformer": + X = pd.DataFrame({"num1": [1.1, 2.0, 3.0, 4.0, 5.0]}) + elif estimator.__class__.__name__ == "LogTransformer": + X = pd.DataFrame({"num1": [-1.0, 2.0, 3.0, 4.0, 5.0]}) + elif estimator.__class__.__name__ == "ReciprocalTransformer": + X = pd.DataFrame({"num1": [0.0, 2.0, 3.0, 4.0, 5.0]}) + else: + X = pd.DataFrame() + + with pytest.raises((ValueError, TypeError)): + estimator.fit(X) + + with pytest.raises(NotFittedError): + estimator.transform(X) From c945deee27829c9d845ab6fef0ab99c1ccdc3cfa Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:24:26 +0530 Subject: [PATCH 19/26] addEd:`test_raises_non_fitted_error_when_error_during_fit` --- tests/test_scaling/test_mean_normalization.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_scaling/test_mean_normalization.py b/tests/test_scaling/test_mean_normalization.py index 240cb7d3f..17dd4cf62 100644 --- a/tests/test_scaling/test_mean_normalization.py +++ b/tests/test_scaling/test_mean_normalization.py @@ -2,6 +2,7 @@ import pandas as pd import pytest +from sklearn import clone from sklearn.exceptions import NotFittedError from feature_engine.scaling import MeanNormalizationScaler @@ -126,3 +127,21 @@ def test_constant_columns_error(): transformer = MeanNormalizationScaler() with pytest.raises(ValueError, match=re.escape("Division by zero is not allowed")): transformer.fit(df) + + +def test_raises_non_fitted_error_when_error_during_fit(): + # input test case + df = pd.DataFrame( + { + "var1": [1.0, 2.0, 3.0], + "var2": [4.0, 5.0, 3.0], + "var3": [7.0, 7.0, 7.0], + } + ) + + transformer = MeanNormalizationScaler() + with pytest.raises(ValueError, match=re.escape("Division by zero is not allowed")): + transformer.fit(df) + + with pytest.raises(NotFittedError): + transformer.transform(df) From b631f63c191d650fb2fcb13cc4eff7586d5cb48f Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:24:48 +0530 Subject: [PATCH 20/26] tranformers --- feature_engine/transformation/arcsin.py | 7 ++++--- feature_engine/transformation/log.py | 7 ++++--- feature_engine/transformation/reciprocal.py | 7 ++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/feature_engine/transformation/arcsin.py b/feature_engine/transformation/arcsin.py index acf2b2670..cc6cc8c49 100644 --- a/feature_engine/transformation/arcsin.py +++ b/feature_engine/transformation/arcsin.py @@ -122,16 +122,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check input dataframe X, variables_ = self._fit_setup(X) - self.variables_ = variables_ - self._get_feature_names_in(X) # check if the variables are in the correct range - if ((X[self.variables_] < 0) | (X[self.variables_] > 1)).any().any(): + if ((X[variables_] < 0) | (X[variables_] > 1)).any().any(): raise ValueError( "Some variables contain values outside the possible range 0-1. " "Can't apply the arcsin transformation. " ) + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/transformation/log.py b/feature_engine/transformation/log.py index 9506c3a32..e71c7c25e 100644 --- a/feature_engine/transformation/log.py +++ b/feature_engine/transformation/log.py @@ -128,15 +128,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ X, variables_ = self._fit_setup(X) - self.variables_ = variables_ - self._get_feature_names_in(X) # check contains zero or negative values - if (X[self.variables_] <= 0).any().any(): + if (X[variables_] <= 0).any().any(): raise ValueError( "Some variables contain zero or negative values, can't apply log" ) + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/transformation/reciprocal.py b/feature_engine/transformation/reciprocal.py index b00d72603..530d94eec 100644 --- a/feature_engine/transformation/reciprocal.py +++ b/feature_engine/transformation/reciprocal.py @@ -113,16 +113,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check input dataframe X, variables_ = self._fit_setup(X) - self.variables_ = variables_ - self._get_feature_names_in(X) # check if the variables contain the value 0 - if (X[self.variables_] == 0).any().any(): + if (X[variables_] == 0).any().any(): raise ValueError( "Some variables contain the value zero, can't apply reciprocal " "transformation." ) + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: From 0c681f5a9f7472e31b4b65773524cc24ec4c1a6b Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:30:36 +0530 Subject: [PATCH 21/26] left --- tests/test_scaling/test_mean_normalization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_scaling/test_mean_normalization.py b/tests/test_scaling/test_mean_normalization.py index 17dd4cf62..1a411ea1b 100644 --- a/tests/test_scaling/test_mean_normalization.py +++ b/tests/test_scaling/test_mean_normalization.py @@ -2,7 +2,6 @@ import pandas as pd import pytest -from sklearn import clone from sklearn.exceptions import NotFittedError from feature_engine.scaling import MeanNormalizationScaler From 6898a3db1cffcaee9d5164d0661f6e406e76c982 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 29 Mar 2026 00:22:48 +0530 Subject: [PATCH 22/26] Updated the `Decisiontreefeatures` and `GeoDIstanceFeatures` --- feature_engine/creation/decision_tree_features.py | 6 +++++- feature_engine/creation/geo_features.py | 12 +++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/feature_engine/creation/decision_tree_features.py b/feature_engine/creation/decision_tree_features.py index 8ec2030aa..22c430655 100644 --- a/feature_engine/creation/decision_tree_features.py +++ b/feature_engine/creation/decision_tree_features.py @@ -260,6 +260,8 @@ def fit(self, X: pd.DataFrame, y: pd.Series): y: pandas Series or np.array = [n_samples,] The target variable that is used to train the decision tree. """ + y = pd.Series(y) + # confirm model type and target variables are compatible. if self.regression is True: if type_of_target(y) == "binary": @@ -268,9 +270,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series): "allowed by this transformer. Check the target values " "or set regression to False." ) + is_binary = None else: check_classification_targets(y) - self._is_binary = type_of_target(y) + is_binary = type_of_target(y) X, y = check_X_y(X, y) @@ -310,6 +313,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): self.variables_ = variables_ self.input_features_ = input_features self.estimators_ = estimators_ + self._is_binary = is_binary self.feature_names_in_ = X.columns.tolist() self.n_features_in_ = X.shape[1] diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 568ed12c4..36c4191ec 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -234,8 +234,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check input dataframe X = check_X(X) - # Store coordinate variables - self.variables_: List[Union[str, int]] = [ + variables = [ self.lat1, self.lon1, self.lat2, @@ -243,17 +242,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): ] # Check all coordinate columns exist - missing = set(self.variables_) - set(X.columns) + missing = set(variables) - set(X.columns) if missing: raise ValueError( f"Coordinate columns {missing} are not present in the dataframe." ) # Check coordinate columns are numerical - check_numerical_variables(X, self.variables_) + check_numerical_variables(X, variables) # Check for missing values - _check_contains_na(X, self.variables_) + _check_contains_na(X, variables) # Validate coordinate ranges if enabled if self.validate_ranges: @@ -269,6 +268,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): f"Longitude values in '{lon_col}' must be between -180 and 180." ) + # save coordinate variables + self.variables_ = variables + # save input features self.feature_names_in_ = X.columns.tolist() From b807ef40e078b84af17e8e651cf3fbb06476ded0 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 29 Mar 2026 00:27:40 +0530 Subject: [PATCH 23/26] fixed `geo_features.py` --- feature_engine/creation/geo_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 36c4191ec..753a7a51f 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -234,7 +234,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check input dataframe X = check_X(X) - variables = [ + variables: List[Union[str, int]] = [ self.lat1, self.lon1, self.lat2, From 3de99687508a3977f5b64761bc8a500ae6167dd1 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 29 Mar 2026 00:39:24 +0530 Subject: [PATCH 24/26] Improved failure triggers --- .../test_check_estimator_creation.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/test_creation/test_check_estimator_creation.py b/tests/test_creation/test_check_estimator_creation.py index 781076f91..dc4ceb42c 100644 --- a/tests/test_creation/test_check_estimator_creation.py +++ b/tests/test_creation/test_check_estimator_creation.py @@ -19,10 +19,6 @@ sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) -# Estimators for sklearn's check_estimator -# Note: GeoDistanceFeatures is not included here because it requires 4 specific -# named coordinate columns, but sklearn's check_estimator generates test data -# with generic column names (x0, x1, x2) that don't match the required columns. _estimators = [ MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore"), RelativeFeatures( @@ -102,19 +98,33 @@ def test_geo_distance_transformer_in_pipeline(): pd.testing.assert_frame_equal(Xtt, Xtp) -@pytest.mark.parametrize("estimator", _estimators) +@pytest.mark.parametrize( + "estimator", + [ + CyclicalFeatures(), + MathFeatures(variables=["feature_1", "feature_2"], func=["sum", "mean"]), + RelativeFeatures(variables=["feature_1"], reference=["feature_2"], func=["div"]), + DecisionTreeFeatures(regression=False), + GeoDistanceFeatures(lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2"), + ], +) def test_raises_non_fitted_error_when_error_during_fit(estimator): estimator = clone(estimator) + X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) y = pd.Series([0, 1, 0, 1, 0]) - # If variables are provided, we need to ensure they are in the dataframe - # or handle the KeyError. + if hasattr(estimator, "variables") and estimator.variables: - X = pd.DataFrame( - {var: ["a", "b", "c", "a", "b"] for var in estimator.variables} - ) + X = pd.DataFrame({var: ["a", "b", "c", "a", "b"] for var in estimator.variables}) + elif isinstance(estimator, GeoDistanceFeatures): + X = pd.DataFrame({ + "lat1": ["a", "b"], + "lon1": ["c", "d"], + "lat2": ["e", "f"], + "lon2": ["g", "h"], + }) with pytest.raises((ValueError, TypeError, KeyError)): estimator.fit(X, y) From 9a5973d5f7b1cfc970e28790e4c01e7dad6c0fc7 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 29 Mar 2026 00:42:01 +0530 Subject: [PATCH 25/26] Improved failure triggers --- .../test_check_estimator_creation.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_creation/test_check_estimator_creation.py b/tests/test_creation/test_check_estimator_creation.py index dc4ceb42c..dcfeaede6 100644 --- a/tests/test_creation/test_check_estimator_creation.py +++ b/tests/test_creation/test_check_estimator_creation.py @@ -102,22 +102,28 @@ def test_geo_distance_transformer_in_pipeline(): "estimator", [ CyclicalFeatures(), - MathFeatures(variables=["feature_1", "feature_2"], func=["sum", "mean"]), - RelativeFeatures(variables=["feature_1"], reference=["feature_2"], func=["div"]), + MathFeatures( + variables=["feature_1", "feature_2"], func=["sum", "mean"] + ), + RelativeFeatures( + variables=["feature_1"], reference=["feature_2"], func=["div"] + ), DecisionTreeFeatures(regression=False), - GeoDistanceFeatures(lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2"), + GeoDistanceFeatures( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ), ], ) def test_raises_non_fitted_error_when_error_during_fit(estimator): estimator = clone(estimator) - X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) y = pd.Series([0, 1, 0, 1, 0]) - if hasattr(estimator, "variables") and estimator.variables: - X = pd.DataFrame({var: ["a", "b", "c", "a", "b"] for var in estimator.variables}) + X = pd.DataFrame( + {var: ["a", "b", "c", "a", "b"] for var in estimator.variables} + ) elif isinstance(estimator, GeoDistanceFeatures): X = pd.DataFrame({ "lat1": ["a", "b"], From aa75e97b50fcbecf7c7b09e4c1d53d207bc49133 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 29 Mar 2026 01:27:02 +0530 Subject: [PATCH 26/26] fixed --- .../creation/decision_tree_features.py | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/feature_engine/creation/decision_tree_features.py b/feature_engine/creation/decision_tree_features.py index 22c430655..e026b1278 100644 --- a/feature_engine/creation/decision_tree_features.py +++ b/feature_engine/creation/decision_tree_features.py @@ -260,9 +260,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): y: pandas Series or np.array = [n_samples,] The target variable that is used to train the decision tree. """ - y = pd.Series(y) - - # confirm model type and target variables are compatible. + X, y = check_X_y(X, y) if self.regression is True: if type_of_target(y) == "binary": raise ValueError( @@ -275,15 +273,12 @@ def fit(self, X: pd.DataFrame, y: pd.Series): check_classification_targets(y) is_binary = type_of_target(y) - X, y = check_X_y(X, y) - # find or check for numerical variables if self.variables is None: variables_ = find_numerical_variables(X) else: variables_ = check_numerical_variables(X, self.variables) - # check if dataset contains na or inf _check_contains_na(X, variables_) _check_contains_inf(X, variables_) @@ -292,7 +287,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series): else: param_grid = {"max_depth": [1, 2, 3, 4]} - # get the sets of variables that will be used to create new features input_features = self._create_variable_combinations( how_to_combine=self.features_to_combine, variables=variables_ ) @@ -301,7 +295,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series): for features in input_features: estimator = self._make_decision_tree(param_grid=param_grid) - # single feature models if isinstance(features, str): estimator.fit(X[features].to_frame(), y) # multi feature models @@ -334,24 +327,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: Either the original dataframe plus the new features or a dataframe of only the new features. """ - # Check method fit has been called check_is_fitted(self) - # check that input is a dataframe X = check_X(X) - # Check if input data contains same number of columns as dataframe used to fit. _check_X_matches_training_df(X, self.n_features_in_) - # check if dataset contains na or inf _check_contains_na(X, self.variables_) _check_contains_inf(X, self.variables_) - # reorder variables to match train set X = X[self.feature_names_in_] - # create new features and add them to the original dataframe - # if regression or multiclass, we return the output of predict() if self.regression is True: for features, estimator in zip(self.input_features_, self.estimators_): if isinstance(features, str): @@ -365,7 +351,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: preds = np.round(preds, self.precision) X.loc[:, f"tree({features})"] = preds - # if binary classification, we return the probability elif self._is_binary == "binary": for features, estimator in zip(self.input_features_, self.estimators_): if isinstance(features, str): @@ -379,7 +364,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: preds = np.round(preds, self.precision) X.loc[:, f"tree({features})"] = preds[:, 1] - # if multiclass, we return the output of predict() else: for features, estimator in zip(self.input_features_, self.estimators_): if isinstance(features, str): @@ -441,7 +425,6 @@ def _create_variable_combinations( else: combos.append(list(feature)) - # if output_features is None, int or list. else: if how_to_combine is None: if len(variables) == 1: @@ -456,7 +439,6 @@ def _create_variable_combinations( els = [list(x) for x in itertools.combinations(variables, i)] combos += els - # output_feature is a list else: for i in how_to_combine: els = [list(x) for x in itertools.combinations(variables, i)] @@ -469,7 +451,6 @@ def _get_new_features_name(self) -> List: feature_names = [f"tree({combo})" for combo in self.input_features_] return feature_names - # for the check_estimator tests def _more_tags(self): tags_dict = _return_tags() tags_dict["requires_y"] = True