diff --git a/feature_engine/_base_transformers/base_numerical.py b/feature_engine/_base_transformers/base_numerical.py index 60212f3d6..d03cf9065 100644 --- a/feature_engine/_base_transformers/base_numerical.py +++ b/feature_engine/_base_transformers/base_numerical.py @@ -28,53 +28,34 @@ class BaseNumericalTransformer( variable transformers, discretisers, math combination. """ - def fit(self, X: pd.DataFrame) -> pd.DataFrame: + def _fit_setup(self, X: pd.DataFrame): """ - Checks that input is a dataframe, finds numerical variables, or alternatively - checks that variables entered by the user are of type numerical. - - Parameters - ---------- - X : Pandas DataFrame - - y : Pandas Series, np.array. Default = None - Parameter is necessary for compatibility with sklearn Pipeline. - - Raises - ------ - TypeError - If the input is not a Pandas DataFrame or a numpy array - If any of the user provided variables are not numerical - ValueError - If there are no numerical variables in the df or the df is empty - If the variable(s) contain null values - - Returns - ------- - X : Pandas DataFrame - The same dataframe entered as parameter + Check dataframe, find numerical variables, check for NA and Inf. + Returns the checked dataframe and the correctly identified numerical variables. """ - # check input dataframe X = check_X(X) # find or check for numerical variables if self.variables is None: - self.variables_ = find_numerical_variables(X) + variables_ = find_numerical_variables(X) else: - self.variables_ = check_numerical_variables(X, self.variables) + variables_ = check_numerical_variables(X, self.variables) # check if dataset contains na or inf - _check_contains_na(X, self.variables_) - _check_contains_inf(X, self.variables_) + _check_contains_na(X, variables_) + _check_contains_inf(X, variables_) - # save input features - self.feature_names_in_ = X.columns.tolist() + return X, variables_ - # save train set shape + def _get_feature_names_in(self, X): + """Get the names and number of features in the train set (the dataframe + used during fit).""" + + self.feature_names_in_ = X.columns.to_list() self.n_features_in_ = X.shape[1] - return X + return self def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame: """ diff --git a/feature_engine/_base_transformers/mixins.py b/feature_engine/_base_transformers/mixins.py index 4d4b7d254..004f3cef7 100644 --- a/feature_engine/_base_transformers/mixins.py +++ b/feature_engine/_base_transformers/mixins.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union +from typing import Dict, List, Tuple, Union import pandas as pd from numpy import ndarray @@ -46,7 +46,9 @@ def transform_x_y(self, X: pd.DataFrame, y: pd.Series): class FitFromDictMixin: - def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame: + def _fit_from_dict( + self, X: pd.DataFrame, user_dict_: Dict + ) -> Tuple[pd.DataFrame, List[Union[str, int]]]: """ Checks that input is a dataframe, checks that variables in the dictionary entered by the user are of type numerical. @@ -71,25 +73,22 @@ def _fit_from_dict(self, X: pd.DataFrame, user_dict_: Dict) -> pd.DataFrame: ------- X : Pandas DataFrame The same dataframe entered as parameter + + variables_ : List + The variables in the dictionary. 
""" # check input dataframe X = check_X(X) # find or check for numerical variables variables = list(user_dict_.keys()) - self.variables_ = check_numerical_variables(X, variables) + variables_ = check_numerical_variables(X, variables) # check if dataset contains na or inf - _check_contains_na(X, self.variables_) - _check_contains_inf(X, self.variables_) - - # save input features - self.feature_names_in_ = X.columns.tolist() - - # save train set shape - self.n_features_in_ = X.shape[1] + _check_contains_na(X, variables_) + _check_contains_inf(X, variables_) - return X + return X, variables_ class GetFeatureNamesOutMixin: diff --git a/feature_engine/creation/cyclical_features.py b/feature_engine/creation/cyclical_features.py index 40e96cab7..849c71a3d 100644 --- a/feature_engine/creation/cyclical_features.py +++ b/feature_engine/creation/cyclical_features.py @@ -147,11 +147,15 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): It is not needed in this transformer. You can pass y or None. """ if self.max_values is None: - X = super().fit(X) - self.max_values_ = X[self.variables_].max().to_dict() + X, variables_ = self._fit_setup(X) + max_values_ = X[variables_].max().to_dict() else: - super()._fit_from_dict(X, self.max_values) - self.max_values_ = self.max_values + X, variables_ = super()._fit_from_dict(X, self.max_values) + max_values_ = self.max_values + + self.variables_ = variables_ + self.max_values_ = max_values_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/creation/decision_tree_features.py b/feature_engine/creation/decision_tree_features.py index 8ec2030aa..e026b1278 100644 --- a/feature_engine/creation/decision_tree_features.py +++ b/feature_engine/creation/decision_tree_features.py @@ -260,7 +260,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): y: pandas Series or np.array = [n_samples,] The target variable that is used to train the decision tree. """ - # confirm model type and target variables are compatible. + X, y = check_X_y(X, y) if self.regression is True: if type_of_target(y) == "binary": raise ValueError( @@ -268,11 +268,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series): "allowed by this transformer. Check the target values " "or set regression to False." 
) + is_binary = None else: check_classification_targets(y) - self._is_binary = type_of_target(y) - - X, y = check_X_y(X, y) + is_binary = type_of_target(y) # find or check for numerical variables if self.variables is None: @@ -280,7 +279,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series): else: variables_ = check_numerical_variables(X, self.variables) - # check if dataset contains na or inf _check_contains_na(X, variables_) _check_contains_inf(X, variables_) @@ -289,7 +287,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series): else: param_grid = {"max_depth": [1, 2, 3, 4]} - # get the sets of variables that will be used to create new features input_features = self._create_variable_combinations( how_to_combine=self.features_to_combine, variables=variables_ ) @@ -298,7 +295,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series): for features in input_features: estimator = self._make_decision_tree(param_grid=param_grid) - # single feature models if isinstance(features, str): estimator.fit(X[features].to_frame(), y) # multi feature models @@ -310,6 +306,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): self.variables_ = variables_ self.input_features_ = input_features self.estimators_ = estimators_ + self._is_binary = is_binary self.feature_names_in_ = X.columns.tolist() self.n_features_in_ = X.shape[1] @@ -330,24 +327,17 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: Either the original dataframe plus the new features or a dataframe of only the new features. """ - # Check method fit has been called check_is_fitted(self) - # check that input is a dataframe X = check_X(X) - # Check if input data contains same number of columns as dataframe used to fit. _check_X_matches_training_df(X, self.n_features_in_) - # check if dataset contains na or inf _check_contains_na(X, self.variables_) _check_contains_inf(X, self.variables_) - # reorder variables to match train set X = X[self.feature_names_in_] - # create new features and add them to the original dataframe - # if regression or multiclass, we return the output of predict() if self.regression is True: for features, estimator in zip(self.input_features_, self.estimators_): if isinstance(features, str): @@ -361,7 +351,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: preds = np.round(preds, self.precision) X.loc[:, f"tree({features})"] = preds - # if binary classification, we return the probability elif self._is_binary == "binary": for features, estimator in zip(self.input_features_, self.estimators_): if isinstance(features, str): @@ -375,7 +364,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: preds = np.round(preds, self.precision) X.loc[:, f"tree({features})"] = preds[:, 1] - # if multiclass, we return the output of predict() else: for features, estimator in zip(self.input_features_, self.estimators_): if isinstance(features, str): @@ -437,7 +425,6 @@ def _create_variable_combinations( else: combos.append(list(feature)) - # if output_features is None, int or list. 
        else:
            if how_to_combine is None:
                if len(variables) == 1:
@@ -452,7 +439,6 @@ def _create_variable_combinations(
                     els = [list(x) for x in itertools.combinations(variables, i)]
                     combos += els
 
-            # output_feature is a list
             else:
                 for i in how_to_combine:
                     els = [list(x) for x in itertools.combinations(variables, i)]
@@ -465,7 +451,6 @@ def _get_new_features_name(self) -> List:
         feature_names = [f"tree({combo})" for combo in self.input_features_]
         return feature_names
 
-    # for the check_estimator tests
     def _more_tags(self):
         tags_dict = _return_tags()
         tags_dict["requires_y"] = True
diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py
index 568ed12c4..753a7a51f 100644
--- a/feature_engine/creation/geo_features.py
+++ b/feature_engine/creation/geo_features.py
@@ -234,8 +234,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         # check input dataframe
         X = check_X(X)
 
-        # Store coordinate variables
-        self.variables_: List[Union[str, int]] = [
+        variables: List[Union[str, int]] = [
             self.lat1,
             self.lon1,
             self.lat2,
@@ -243,17 +242,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         ]
 
         # Check all coordinate columns exist
-        missing = set(self.variables_) - set(X.columns)
+        missing = set(variables) - set(X.columns)
         if missing:
             raise ValueError(
                 f"Coordinate columns {missing} are not present in the dataframe."
             )
 
         # Check coordinate columns are numerical
-        check_numerical_variables(X, self.variables_)
+        check_numerical_variables(X, variables)
 
         # Check for missing values
-        _check_contains_na(X, self.variables_)
+        _check_contains_na(X, variables)
 
         # Validate coordinate ranges if enabled
         if self.validate_ranges:
@@ -269,6 +268,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                     f"Longitude values in '{lon_col}' must be between -180 and 180."
                 )
 
+        # save coordinate variables
+        self.variables_ = variables
+
         # save input features
         self.feature_names_in_ = X.columns.tolist()
 
diff --git a/feature_engine/discretisation/arbitrary.py b/feature_engine/discretisation/arbitrary.py
index 44d35ecdf..6a3e9c468 100644
--- a/feature_engine/discretisation/arbitrary.py
+++ b/feature_engine/discretisation/arbitrary.py
@@ -151,10 +151,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             y is not needed in this transformer. You can pass y or None.
         """
         # check input dataframe
-        X = super()._fit_from_dict(X, self.binning_dict)
+        X, variables_ = super()._fit_from_dict(X, self.binning_dict)
 
         # for consistency with the rest of the discretisers, we add this attribute
+        self.variables_ = variables_
         self.binner_dict_ = self.binning_dict
+        self._get_feature_names_in(X)
 
         return self
 
diff --git a/feature_engine/discretisation/decision_tree.py b/feature_engine/discretisation/decision_tree.py
index af691e4aa..648771a1e 100644
--- a/feature_engine/discretisation/decision_tree.py
+++ b/feature_engine/discretisation/decision_tree.py
@@ -214,7 +214,7 @@ def __init__(
         self.param_grid = param_grid
         self.random_state = random_state
 
-    def fit(self, X: pd.DataFrame, y: pd.Series):  # type: ignore
+    def fit(self, X: pd.DataFrame, y: pd.Series):
         """
         Fit one decision tree per variable to discretize with cross-validation and
         grid-search for hyperparameters.
@@ -241,7 +241,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore check_classification_targets(y) # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) if self.param_grid: param_grid = self.param_grid @@ -251,7 +251,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore binner_dict_ = {} scores_dict_ = {} - for var in self.variables_: + for var in variables_: if self.regression: model = DecisionTreeRegressor(random_state=self.random_state) @@ -269,7 +269,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore scores_dict_[var] = tree_model.score(X[var].to_frame(), y) if self.bin_output != "prediction": - for var in self.variables_: + for var in variables_: clf = binner_dict_[var].best_estimator_ threshold = clf.tree_.threshold feature = clf.tree_.feature @@ -280,6 +280,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # type: ignore self.binner_dict_ = binner_dict_ self.scores_dict_ = scores_dict_ + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/discretisation/equal_frequency.py b/feature_engine/discretisation/equal_frequency.py index 9060f1d49..841f3edb3 100644 --- a/feature_engine/discretisation/equal_frequency.py +++ b/feature_engine/discretisation/equal_frequency.py @@ -159,17 +159,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) - self.binner_dict_ = {} + binner_dict_ = {} - for var in self.variables_: + for var in variables_: tmp, bins = pd.qcut(x=X[var], q=self.q, retbins=True, duplicates="drop") # Prepend/Append infinities to accommodate outliers bins = list(bins) bins[0] = float("-inf") bins[len(bins) - 1] = float("inf") - self.binner_dict_[var] = bins + binner_dict_[var] = bins + self.binner_dict_ = binner_dict_ + self.variables_ = variables_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/discretisation/equal_width.py b/feature_engine/discretisation/equal_width.py index 03787835d..0bfd2b4bc 100644 --- a/feature_engine/discretisation/equal_width.py +++ b/feature_engine/discretisation/equal_width.py @@ -168,12 +168,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) # fit - self.binner_dict_ = {} + binner_dict_ = {} - for var in self.variables_: + for var in variables_: tmp, bins = pd.cut( x=X[var], bins=self.bins, @@ -186,6 +186,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): bins = list(bins) bins[0] = float("-inf") bins[len(bins) - 1] = float("inf") - self.binner_dict_[var] = bins + binner_dict_[var] = bins + self.binner_dict_ = binner_dict_ + self.variables_ = variables_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/discretisation/geometric_width.py b/feature_engine/discretisation/geometric_width.py index 9f7c37d21..7da28261a 100644 --- a/feature_engine/discretisation/geometric_width.py +++ b/feature_engine/discretisation/geometric_width.py @@ -159,12 +159,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) # fit - self.binner_dict_ = {} + binner_dict_ = {} - for var in self.variables_: + for var in variables_: min_, max_ = X[var].min(), X[var].max() increment = np.power(max_ - min_, 1.0 / self.bins) bins = np.r_[ @@ -172,6 
+172,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): ] bins = np.sort(bins) bins = list(bins) - self.binner_dict_[var] = bins + binner_dict_[var] = bins + + self.variables_ = variables_ + self.binner_dict_ = binner_dict_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/arbitrary_number.py b/feature_engine/imputation/arbitrary_number.py index 668f391b0..e16c62ae2 100644 --- a/feature_engine/imputation/arbitrary_number.py +++ b/feature_engine/imputation/arbitrary_number.py @@ -149,17 +149,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find or check for numerical variables # create the imputer dictionary if self.imputer_dict: - self.variables_ = check_numerical_variables( + variables_ = check_numerical_variables( X, list(self.imputer_dict.keys()) ) - self.imputer_dict_ = self.imputer_dict + imputer_dict_ = self.imputer_dict else: if self.variables is None: - self.variables_ = find_numerical_variables(X) + variables_ = find_numerical_variables(X) else: - self.variables_ = check_numerical_variables(X, self.variables) - self.imputer_dict_ = {var: self.arbitrary_number for var in self.variables_} + variables_ = check_numerical_variables(X, self.variables) + imputer_dict_ = {var: self.arbitrary_number for var in variables_} + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 8c4000a0c..f329c3b44 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -169,22 +169,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # select variables to encode if self.ignore_format is True: if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) else: if self.variables is None: - self.variables_ = find_categorical_variables(X) + variables_ = find_categorical_variables(X) else: - self.variables_ = check_categorical_variables(X, self.variables) + variables_ = check_categorical_variables(X, self.variables) if self.imputation_method == "missing": - self.imputer_dict_ = {var: self.fill_value for var in self.variables_} + imputer_dict_ = {var: self.fill_value for var in variables_} elif self.imputation_method == "frequent": # if imputing only 1 variable: - if len(self.variables_) == 1: - var = self.variables_[0] + if len(variables_) == 1: + var = variables_[0] mode_vals = X[var].mode() # Some variables may contain more than 1 mode: @@ -193,13 +193,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): f"The variable {var} contains multiple frequent categories." ) - self.imputer_dict_ = {var: mode_vals[0]} + imputer_dict_ = {var: mode_vals[0]} # imputing multiple variables: else: # Returns a dataframe with 1 row if there is one mode per # variable, or more rows if there are more modes: - mode_vals = X[self.variables_].mode() + mode_vals = X[variables_].mode() # Careful: some variables contain multiple modes if len(mode_vals) > 1: @@ -213,8 +213,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): f"categories." 
) - self.imputer_dict_ = mode_vals.iloc[0].to_dict() + imputer_dict_ = mode_vals.iloc[0].to_dict() + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/drop_missing_data.py b/feature_engine/imputation/drop_missing_data.py index 07c6f3e75..12de75478 100644 --- a/feature_engine/imputation/drop_missing_data.py +++ b/feature_engine/imputation/drop_missing_data.py @@ -150,16 +150,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find variables for which indicator should be added if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) # If user passes a threshold, then missing_only is ignored: if self.threshold is None and self.missing_only is True: - self.variables_ = [ - var for var in self.variables_ if X[var].isnull().sum() > 0 + variables_ = [ + var for var in variables_ if X[var].isnull().sum() > 0 ] + self.variables_ = variables_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/end_tail.py b/feature_engine/imputation/end_tail.py index 59e59f32a..a5ffda663 100644 --- a/feature_engine/imputation/end_tail.py +++ b/feature_engine/imputation/end_tail.py @@ -177,35 +177,37 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find or check for numerical variables if self.variables is None: - self.variables_ = find_numerical_variables(X) + variables_ = find_numerical_variables(X) else: - self.variables_ = check_numerical_variables(X, self.variables) + variables_ = check_numerical_variables(X, self.variables) # estimate imputation values if self.imputation_method == "max": - self.imputer_dict_ = (X[self.variables_].max() * self.fold).to_dict() + imputer_dict_ = (X[variables_].max() * self.fold).to_dict() elif self.imputation_method == "gaussian": if self.tail == "right": - self.imputer_dict_ = ( - X[self.variables_].mean() + self.fold * X[self.variables_].std() + imputer_dict_ = ( + X[variables_].mean() + self.fold * X[variables_].std() ).to_dict() elif self.tail == "left": - self.imputer_dict_ = ( - X[self.variables_].mean() - self.fold * X[self.variables_].std() + imputer_dict_ = ( + X[variables_].mean() - self.fold * X[variables_].std() ).to_dict() elif self.imputation_method == "iqr": - IQR = X[self.variables_].quantile(0.75) - X[self.variables_].quantile(0.25) + IQR = X[variables_].quantile(0.75) - X[variables_].quantile(0.25) if self.tail == "right": - self.imputer_dict_ = ( - X[self.variables_].quantile(0.75) + (IQR * self.fold) + imputer_dict_ = ( + X[variables_].quantile(0.75) + (IQR * self.fold) ).to_dict() elif self.tail == "left": - self.imputer_dict_ = ( - X[self.variables_].quantile(0.25) - (IQR * self.fold) + imputer_dict_ = ( + X[variables_].quantile(0.25) - (IQR * self.fold) ).to_dict() + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/mean_median.py b/feature_engine/imputation/mean_median.py index da845e063..3f3baee0a 100644 --- a/feature_engine/imputation/mean_median.py +++ b/feature_engine/imputation/mean_median.py @@ -127,17 +127,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find or check for numerical variables if self.variables is None: - self.variables_ = find_numerical_variables(X) + variables_ = find_numerical_variables(X) 
else: - self.variables_ = check_numerical_variables(X, self.variables) + variables_ = check_numerical_variables(X, self.variables) # find imputation parameters: mean or median if self.imputation_method == "mean": - self.imputer_dict_ = X[self.variables_].mean().to_dict() + imputer_dict_ = X[variables_].mean().to_dict() elif self.imputation_method == "median": - self.imputer_dict_ = X[self.variables_].median().to_dict() + imputer_dict_ = X[variables_].median().to_dict() + self.variables_ = variables_ + self.imputer_dict_ = imputer_dict_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/missing_indicator.py b/feature_engine/imputation/missing_indicator.py index 01660a654..a902230c9 100644 --- a/feature_engine/imputation/missing_indicator.py +++ b/feature_engine/imputation/missing_indicator.py @@ -129,15 +129,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find variables for which indicator should be added if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) if self.missing_only is True: - self.variables_ = [ - var for var in self.variables_ if X[var].isnull().sum() > 0 + variables_ = [ + var for var in variables_ if X[var].isnull().sum() > 0 ] + self.variables_ = variables_ self._get_feature_names_in(X) return self diff --git a/feature_engine/imputation/random_sample.py b/feature_engine/imputation/random_sample.py index d05aeaac8..f319242e5 100644 --- a/feature_engine/imputation/random_sample.py +++ b/feature_engine/imputation/random_sample.py @@ -184,26 +184,29 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # find variables to impute if self.variables is None: - self.variables_ = find_all_variables(X) + variables_ = find_all_variables(X) else: - self.variables_ = check_all_variables(X, self.variables) + variables_ = check_all_variables(X, self.variables) # take a copy of the selected variables - self.X_ = X[self.variables_].copy() + X_ = X[variables_].copy() # check the variables assigned to the random state if self.seed == "observation": - self.random_state = _check_variables_input_value(self.random_state) - if isinstance(self.random_state, (int, str)): - self.random_state = [self.random_state] - if self.random_state and any( - var for var in self.random_state if var not in X.columns + random_state = _check_variables_input_value(self.random_state) + if isinstance(random_state, (int, str)): + random_state = [random_state] + if random_state and any( + var for var in random_state if var not in X.columns ): raise ValueError( - "There are variables assigned as random state which are not part " - "of the training dataframe." + "One or more of the variables indicated in random_state " + "is not present in the dataframe." 
) + self.random_state = random_state + self.variables_ = variables_ + self.X_ = X_ self._get_feature_names_in(X) return self diff --git a/feature_engine/scaling/mean_normalization.py b/feature_engine/scaling/mean_normalization.py index 78f4a958c..93c9ae171 100644 --- a/feature_engine/scaling/mean_normalization.py +++ b/feature_engine/scaling/mean_normalization.py @@ -120,18 +120,24 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) - self.mean_ = X[self.variables_].mean().to_dict() - self.range_ = (X[self.variables_].max() - X[self.variables_].min()).to_dict() + X, variables_ = self._fit_setup(X) + + mean_ = X[variables_].mean().to_dict() + range_ = (X[variables_].max() - X[variables_].min()).to_dict() # check for constant columns - constant_columns = [col for col, value in self.range_.items() if value == 0] + constant_columns = [col for col, value in range_.items() if value == 0] if constant_columns: raise ValueError( f"The following variable(s) are constant: {constant_columns}. " "Division by zero is not allowed. Please remove constant columns." ) + self.variables_ = variables_ + self.mean_ = mean_ + self.range_ = range_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/transformation/arcsin.py b/feature_engine/transformation/arcsin.py index 059df813e..cc6cc8c49 100644 --- a/feature_engine/transformation/arcsin.py +++ b/feature_engine/transformation/arcsin.py @@ -121,15 +121,18 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) # check if the variables are in the correct range - if ((X[self.variables_] < 0) | (X[self.variables_] > 1)).any().any(): + if ((X[variables_] < 0) | (X[variables_] > 1)).any().any(): raise ValueError( "Some variables contain values outside the possible range 0-1. " "Can't apply the arcsin transformation. 
" ) + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/transformation/arcsinh.py b/feature_engine/transformation/arcsinh.py index e0020ff86..70e627a23 100644 --- a/feature_engine/transformation/arcsinh.py +++ b/feature_engine/transformation/arcsinh.py @@ -161,7 +161,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe and find/check numerical variables - X = super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/transformation/boxcox.py b/feature_engine/transformation/boxcox.py index 1541ff8b5..b1e817b7c 100644 --- a/feature_engine/transformation/boxcox.py +++ b/feature_engine/transformation/boxcox.py @@ -135,12 +135,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) - self.lambda_dict_ = {} + lambda_dict_ = {} - for var in self.variables_: - _, self.lambda_dict_[var] = stats.boxcox(X[var]) + for var in variables_: + _, lambda_dict_[var] = stats.boxcox(X[var]) + + self.variables_ = variables_ + self.lambda_dict_ = lambda_dict_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/transformation/log.py b/feature_engine/transformation/log.py index 695243291..e71c7c25e 100644 --- a/feature_engine/transformation/log.py +++ b/feature_engine/transformation/log.py @@ -127,15 +127,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): It is not needed in this transformer. You can pass y or None. """ - # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) # check contains zero or negative values - if (X[self.variables_] <= 0).any().any(): + if (X[variables_] <= 0).any().any(): raise ValueError( "Some variables contain zero or negative values, can't apply log" ) + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -356,23 +358,34 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # check input dataframe if isinstance(self.C, dict): - X = super()._fit_from_dict(X, self.C) + X, variables_ = super()._fit_from_dict(X, self.C) else: - X = super().fit(X) - - self.C_ = self.C + X, variables_ = self._fit_setup(X) # calculate C to add to each variable + C_: Union[int, float, Dict[Union[str, int], Union[float, int]]] if self.C == "auto": # we add 0 to positive variables - c_dict = {var: 0 for var in self.variables_ if X[var].min() > 0} + c_dict: Dict[Union[str, int], Union[float, int]] = { + var: 0.0 for var in variables_ if X[var].min() > 0 + } # we add the minimum plus 1 to non-positive variables - non_positive_vars = [ - var for var in self.variables_ if var not in c_dict.keys() - ] - c_dict.update(dict(X[non_positive_vars].min(axis=0).abs() + 1)) - self.C_ = c_dict # type:ignore + non_positive_vars = [var for var in variables_ if var not in c_dict.keys()] + if non_positive_vars: + c_dict.update(dict(X[non_positive_vars].min(axis=0).abs() + 1)) + C_ = c_dict + elif isinstance(self.C, (int, float, dict)): + C_ = self.C + else: + raise ValueError( + f"C can take only 'auto', integers, floats or dicts. " + f"Got {self.C} instead." 
+ ) + + self.variables_ = variables_ + self.C_ = C_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/transformation/power.py b/feature_engine/transformation/power.py index ae10a16bf..12e737b64 100644 --- a/feature_engine/transformation/power.py +++ b/feature_engine/transformation/power.py @@ -121,7 +121,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - super().fit(X) + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) return self diff --git a/feature_engine/transformation/reciprocal.py b/feature_engine/transformation/reciprocal.py index d51557331..530d94eec 100644 --- a/feature_engine/transformation/reciprocal.py +++ b/feature_engine/transformation/reciprocal.py @@ -112,15 +112,18 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) # check if the variables contain the value 0 - if (X[self.variables_] == 0).any().any(): + if (X[variables_] == 0).any().any(): raise ValueError( "Some variables contain the value zero, can't apply reciprocal " "transformation." ) + self.variables_ = variables_ + self._get_feature_names_in(X) + return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/feature_engine/transformation/yeojohnson.py b/feature_engine/transformation/yeojohnson.py index f8d938e4a..0103ceac7 100644 --- a/feature_engine/transformation/yeojohnson.py +++ b/feature_engine/transformation/yeojohnson.py @@ -128,12 +128,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ # check input dataframe - X = super().fit(X) + X, variables_ = self._fit_setup(X) - self.lambda_dict_ = {} + lambda_dict_ = {} - for var in self.variables_: - _, self.lambda_dict_[var] = stats.yeojohnson(X[var]) + for var in variables_: + _, lambda_dict_[var] = stats.yeojohnson(X[var]) + + self.variables_ = variables_ + self.lambda_dict_ = lambda_dict_ + self._get_feature_names_in(X) return self diff --git a/tests/test_base_transformers/test_base_numerical_transformer.py b/tests/test_base_transformers/test_base_numerical_transformer.py index 1629ab67e..aa48aa8a7 100644 --- a/tests/test_base_transformers/test_base_numerical_transformer.py +++ b/tests/test_base_transformers/test_base_numerical_transformer.py @@ -7,8 +7,14 @@ class MockClass(BaseNumericalTransformer): - def __init__(self): - self.variables = None + def __init__(self, variables=None): + self.variables = variables + + def fit(self, X, y=None): + X, variables_ = self._fit_setup(X) + self.variables_ = variables_ + self._get_feature_names_in(X) + return X def transform(self, X): return self._check_transform_input_and_state(X) diff --git a/tests/test_creation/test_check_estimator_creation.py b/tests/test_creation/test_check_estimator_creation.py index e3c22caa1..dcfeaede6 100644 --- a/tests/test_creation/test_check_estimator_creation.py +++ b/tests/test_creation/test_check_estimator_creation.py @@ -1,3 +1,6 @@ +from sklearn import clone +from sklearn.exceptions import NotFittedError + import pandas as pd import pytest import sklearn @@ -16,10 +19,6 @@ sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) -# Estimators for sklearn's check_estimator -# Note: GeoDistanceFeatures is not included here because it requires 4 specific -# named coordinate columns, but sklearn's check_estimator generates test data -# with generic column names (x0, x1, x2) that don't match the required columns. 
_estimators = [ MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore"), RelativeFeatures( @@ -97,3 +96,44 @@ def test_geo_distance_transformer_in_pipeline(): Xtp = pipe.fit_transform(X.copy(), y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize( + "estimator", + [ + CyclicalFeatures(), + MathFeatures( + variables=["feature_1", "feature_2"], func=["sum", "mean"] + ), + RelativeFeatures( + variables=["feature_1"], reference=["feature_2"], func=["div"] + ), + DecisionTreeFeatures(regression=False), + GeoDistanceFeatures( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ), + ], +) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) + y = pd.Series([0, 1, 0, 1, 0]) + + if hasattr(estimator, "variables") and estimator.variables: + X = pd.DataFrame( + {var: ["a", "b", "c", "a", "b"] for var in estimator.variables} + ) + elif isinstance(estimator, GeoDistanceFeatures): + X = pd.DataFrame({ + "lat1": ["a", "b"], + "lon1": ["c", "d"], + "lat2": ["e", "f"], + "lon2": ["g", "h"], + }) + + with pytest.raises((ValueError, TypeError, KeyError)): + estimator.fit(X, y) + + with pytest.raises(NotFittedError): + estimator.transform(X) diff --git a/tests/test_discretisation/test_check_estimator_discretisers.py b/tests/test_discretisation/test_check_estimator_discretisers.py index 87e175eac..d151c3080 100644 --- a/tests/test_discretisation/test_check_estimator_discretisers.py +++ b/tests/test_discretisation/test_check_estimator_discretisers.py @@ -1,3 +1,6 @@ +from sklearn import clone +from sklearn.exceptions import NotFittedError + import numpy as np import pandas as pd import pytest @@ -63,3 +66,17 @@ def test_transformers_within_pipeline(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize("estimator", _estimators) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) + y = pd.Series([0, 1, 0, 1, 0]) + + with pytest.raises((ValueError, TypeError, KeyError)): + estimator.fit(X, y) + + with pytest.raises(NotFittedError): + estimator.transform(X) diff --git a/tests/test_imputation/test_check_estimator_imputers.py b/tests/test_imputation/test_check_estimator_imputers.py index 0091c7bf7..e9017309a 100644 --- a/tests/test_imputation/test_check_estimator_imputers.py +++ b/tests/test_imputation/test_check_estimator_imputers.py @@ -1,3 +1,6 @@ +from sklearn import clone +from sklearn.exceptions import NotFittedError + import pandas as pd import pytest import sklearn @@ -69,3 +72,28 @@ def test_transformers_in_pipeline_with_set_output_pandas(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize("estimator", _estimators) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + if estimator.__class__.__name__ in [ + "MeanMedianImputer", + "EndTailImputer", + "ArbitraryNumberImputer", + ]: + X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]}) + + elif estimator.__class__.__name__ == "CategoricalImputer": + estimator.set_params(ignore_format=False) + X = pd.DataFrame({"num1": [1.0, 2.0, 3.0, 4.0, 5.0]}) + + else: + X = pd.DataFrame() + + with pytest.raises((ValueError, TypeError)): + estimator.fit(X) + + with pytest.raises(NotFittedError): + estimator.transform(X) diff --git 
a/tests/test_scaling/test_mean_normalization.py b/tests/test_scaling/test_mean_normalization.py index 240cb7d3f..1a411ea1b 100644 --- a/tests/test_scaling/test_mean_normalization.py +++ b/tests/test_scaling/test_mean_normalization.py @@ -126,3 +126,21 @@ def test_constant_columns_error(): transformer = MeanNormalizationScaler() with pytest.raises(ValueError, match=re.escape("Division by zero is not allowed")): transformer.fit(df) + + +def test_raises_non_fitted_error_when_error_during_fit(): + # input test case + df = pd.DataFrame( + { + "var1": [1.0, 2.0, 3.0], + "var2": [4.0, 5.0, 3.0], + "var3": [7.0, 7.0, 7.0], + } + ) + + transformer = MeanNormalizationScaler() + with pytest.raises(ValueError, match=re.escape("Division by zero is not allowed")): + transformer.fit(df) + + with pytest.raises(NotFittedError): + transformer.transform(df) diff --git a/tests/test_transformation/test_check_estimator_transformers.py b/tests/test_transformation/test_check_estimator_transformers.py index 8f482e10d..5ab17b7d6 100644 --- a/tests/test_transformation/test_check_estimator_transformers.py +++ b/tests/test_transformation/test_check_estimator_transformers.py @@ -1,3 +1,6 @@ +from sklearn import clone +from sklearn.exceptions import NotFittedError + import pandas as pd import pytest import sklearn @@ -95,3 +98,25 @@ def test_transformers_in_pipeline_with_set_output_pandas(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +@pytest.mark.parametrize("estimator", _estimators) +def test_raises_non_fitted_error_when_error_during_fit(estimator): + estimator = clone(estimator) + + if estimator.__class__.__name__ == "BoxCoxTransformer": + X = pd.DataFrame({"num1": [-1.0, 2.0, 3.0, 4.0, 5.0]}) + elif estimator.__class__.__name__ == "ArcsinTransformer": + X = pd.DataFrame({"num1": [1.1, 2.0, 3.0, 4.0, 5.0]}) + elif estimator.__class__.__name__ == "LogTransformer": + X = pd.DataFrame({"num1": [-1.0, 2.0, 3.0, 4.0, 5.0]}) + elif estimator.__class__.__name__ == "ReciprocalTransformer": + X = pd.DataFrame({"num1": [0.0, 2.0, 3.0, 4.0, 5.0]}) + else: + X = pd.DataFrame() + + with pytest.raises((ValueError, TypeError)): + estimator.fit(X) + + with pytest.raises(NotFittedError): + estimator.transform(X)
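
The pattern this patch applies uniformly across the transformers is: run every check and compute every statistic on local variables first, and assign the learned attributes (`variables_`, `binner_dict_`, `imputer_dict_`, `feature_names_in_`, `n_features_in_`, and so on) only once all validation has passed. Because sklearn's `check_is_fitted` looks for instance attributes ending in an underscore, a `fit` call that raises part-way through now leaves no partial state behind, so a subsequent `transform` raises `NotFittedError`, which is exactly what the new `test_raises_non_fitted_error_when_error_during_fit` tests assert. A minimal sketch of the idea follows; `MockScaler` and its checks are illustrative stand-ins, not feature_engine classes (the real helpers are `_fit_setup` and `_get_feature_names_in` on `BaseNumericalTransformer`):

```python
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted


class MockScaler(TransformerMixin, BaseEstimator):
    """Toy transformer following the validate-first, assign-last fit pattern."""

    def fit(self, X: pd.DataFrame, y=None):
        # validate and learn on local variables; nothing is stored on self yet
        variables_ = X.select_dtypes("number").columns.tolist()
        if not variables_:
            raise ValueError("No numerical variables found in this dataframe.")
        if X[variables_].isnull().any().any():
            raise ValueError("Some of the variables contain NaN values.")

        mean_ = X[variables_].mean().to_dict()

        # assign learned attributes only after every check has passed
        self.variables_ = variables_
        self.mean_ = mean_
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # raises NotFittedError if no attribute ending in "_" was ever set,
        # i.e. if fit never ran to completion
        check_is_fitted(self)
        X = X.copy()
        for var in self.variables_:
            X[var] = X[var] - self.mean_[var]
        return X


# a fit that raises leaves the transformer unfitted, as the new tests assert
scaler = MockScaler()
bad_X = pd.DataFrame({"cat1": ["a", "b", "c", "a", "b"]})
try:
    scaler.fit(bad_X)
except ValueError:
    pass

try:
    scaler.transform(bad_X)
except NotFittedError:
    print("transform raised NotFittedError after the failed fit")
```

The same reasoning explains why the early `self.variables_ = ...` assignments were removed throughout: with them, a failed `fit` could leave a transformer half-fitted, and `check_is_fitted` would wrongly report it as fitted during `transform`.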