From fb230feeb6acc8b5da972133750eeb5278d0f7c2 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 20:00:45 +0530 Subject: [PATCH 01/32] feat(CategoricalImputer): add errors param to handle multimodal variables (#904) --- docs/whats_new/v_190.rst | 1 + feature_engine/imputation/categorical.py | 54 +++++++++++-- .../test_categorical_imputer.py | 77 ++++++++++++++++++- 3 files changed, 122 insertions(+), 10 deletions(-) diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst index 3ee3222fb..f1b6e22da 100644 --- a/docs/whats_new/v_190.rst +++ b/docs/whats_new/v_190.rst @@ -53,6 +53,7 @@ New transformers Enhancements ~~~~~~~~~~~~ +- Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar `_) - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli `_) BUG diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 8c4000a0c..40c0a1276 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -2,6 +2,7 @@ # License: BSD 3 clause from typing import List, Optional, Union +import warnings import pandas as pd @@ -88,6 +89,18 @@ class CategoricalImputer(BaseImputer): type object or categorical. If True, the imputer will select all variables or accept all variables entered by the user, including those cast as numeric. + errors : str, default='raise' + Indicates what to do when the selected imputation_method='frequent' + and a variable has more than 1 mode. + + If 'raise', raises a ValueError and stops the fit. + + If 'warn', raises a UserWarning and continues, imputing using the + first most frequent category found. + + If 'ignore', continues without warnings, imputing using the first + most frequent category found. + Attributes ---------- {imputer_dict_} @@ -135,6 +148,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, return_object: bool = False, ignore_format: bool = False, + errors: str = "raise", ) -> None: if imputation_method not in ["missing", "frequent"]: raise ValueError( @@ -144,11 +158,18 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") + if errors not in ("raise", "warn", "ignore"): + raise ValueError( + "errors takes only values 'raise', 'warn', or 'ignore'. " + f"Got {errors} instead." + ) + self.imputation_method = imputation_method self.fill_value = fill_value self.variables = _check_variables_input_value(variables) self.return_object = return_object self.ignore_format = ignore_format + self.errors = errors def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ @@ -189,9 +210,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Some variables may contain more than 1 mode: if len(mode_vals) > 1: - raise ValueError( - f"The variable {var} contains multiple frequent categories." - ) + if self.errors == "raise": + raise ValueError( + f"The variable {var} contains multiple frequent categories. " + f"Set errors='warn' or errors='ignore' to allow imputation " + f"using the first most frequent category found." + ) + elif self.errors == "warn": + warnings.warn( + f"Variable {var} has multiple frequent categories. " + f"The first category found, {mode_vals[0]}, will be used " + f"for imputation.", + UserWarning, + ) self.imputer_dict_ = {var: mode_vals[0]} @@ -208,10 +239,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): varnames_str = ", ".join(varnames) else: varnames_str = varnames[0] - raise ValueError( - f"The variable(s) {varnames_str} contain(s) multiple frequent " - f"categories." - ) + + if self.errors == "raise": + raise ValueError( + f"The variable(s) {varnames_str} contain(s) multiple frequent " + f"categories. Set errors='warn' or errors='ignore' to allow " + f"imputation using the first most frequent category found." + ) + elif self.errors == "warn": + warnings.warn( + f"Variable(s) {varnames_str} have multiple frequent categories. " + f"The first category found will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = mode_vals.iloc[0].to_dict() diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 182e8826b..1e55212d5 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,8 +1,19 @@ +import numpy as np +import pandas as pd import pandas as pd import pytest +import warnings from feature_engine.imputation import CategoricalImputer +# --- Shared fixture: perfectly multimodal variable --- +@pytest.fixture +def multimodal_df(): + return pd.DataFrame({ + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + }) + def test_impute_with_string_missing_and_automatically_find_variables(df_na): # set up transformer @@ -150,14 +161,22 @@ def test_error_when_imputation_method_not_frequent_or_missing(): def test_error_when_variable_contains_multiple_modes(df_na): - msg = "The variable Name contains multiple frequent categories." + msg = ( + "The variable Name contains multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent", variables="Name") with pytest.raises(ValueError) as record: imputer.fit(df_na) # check that error message matches assert str(record.value) == msg - msg = "The variable(s) Name contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_na) @@ -166,7 +185,11 @@ def test_error_when_variable_contains_multiple_modes(df_na): df_ = df_na.copy() df_["Name_dup"] = df_["Name"] - msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories." + msg = ( + "The variable(s) Name, Name_dup contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) imputer = CategoricalImputer(imputation_method="frequent") with pytest.raises(ValueError) as record: imputer.fit(df_) @@ -305,3 +328,51 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format): # check that error message matches assert str(record.value) == msg + + +def test_errors_raise_on_multimodal_is_default(multimodal_df): + """Default behaviour: raise ValueError on multimodal variable.""" + imputer = CategoricalImputer(imputation_method="frequent") + with pytest.raises(ValueError, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_emits_userwarning(multimodal_df): + """errors='warn': UserWarning must be emitted.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +def test_errors_warn_uses_first_mode(multimodal_df): + """errors='warn': imputer_dict_ should contain the first mode.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="warn") + with pytest.warns(UserWarning): + imputer.fit(multimodal_df) + expected = multimodal_df["city"].mode()[0] + assert imputer.imputer_dict_["city"] == expected + + +def test_errors_ignore_no_warning_raised(multimodal_df): + """errors='ignore': no warnings should be emitted.""" + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + with warnings.catch_warnings(): + warnings.simplefilter("error") # Promote all warnings to errors + imputer.fit(multimodal_df) # Should NOT raise + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_errors_invalid_value_raises(): + """Passing an unsupported value for errors should raise ValueError at init.""" + with pytest.raises(ValueError, match="errors takes only values"): + CategoricalImputer(imputation_method="frequent", errors="bad_value") + + +def test_errors_param_ignored_when_imputation_method_is_missing(): + """errors param has no effect for imputation_method='missing'.""" + df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) + imputer = CategoricalImputer(imputation_method="missing", errors="warn") + # Should fit without warnings since there's no mode computation + with warnings.catch_warnings(): + warnings.simplefilter("error") + imputer.fit(df) From 81be3489fb56fc80ab1f8906bc5d12111bb19858 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 20:41:13 +0530 Subject: [PATCH 02/32] style: fix flake8 line length in CategoricalImputer --- feature_engine/imputation/categorical.py | 28 +++++++++++-------- .../test_categorical_imputer.py | 11 +++++--- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 40c0a1276..cc1c2e2d2 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -212,15 +212,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if len(mode_vals) > 1: if self.errors == "raise": raise ValueError( - f"The variable {var} contains multiple frequent categories. " - f"Set errors='warn' or errors='ignore' to allow imputation " - f"using the first most frequent category found." + f"The variable {var} contains multiple " + f"frequent categories. Set errors='warn' or " + f"errors='ignore' to allow imputation using " + f"the first most frequent category found." ) elif self.errors == "warn": warnings.warn( - f"Variable {var} has multiple frequent categories. " - f"The first category found, {mode_vals[0]}, will be used " - f"for imputation.", + f"Variable {var} has multiple frequent " + f"categories. The first category found, " + f"{mode_vals[0]}, will be used for imputation.", UserWarning, ) @@ -242,14 +243,17 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if self.errors == "raise": raise ValueError( - f"The variable(s) {varnames_str} contain(s) multiple frequent " - f"categories. Set errors='warn' or errors='ignore' to allow " - f"imputation using the first most frequent category found." + f"The variable(s) {varnames_str} contain(s) " + f"multiple frequent categories. Set " + f"errors='warn' or errors='ignore' to allow " + f"imputation using the first most frequent " + f"category found." ) elif self.errors == "warn": warnings.warn( - f"Variable(s) {varnames_str} have multiple frequent categories. " - f"The first category found will be used for imputation.", + f"Variable(s) {varnames_str} have multiple " + f"frequent categories. The first category " + f"found will be used for imputation.", UserWarning, ) @@ -301,4 +305,4 @@ def _more_tags(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True - return tags + return tags \ No newline at end of file diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 1e55212d5..c6ea41d89 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -6,13 +6,16 @@ from feature_engine.imputation import CategoricalImputer + # --- Shared fixture: perfectly multimodal variable --- @pytest.fixture def multimodal_df(): - return pd.DataFrame({ - "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], - "country": ["UK", "UK", "FR", "FR", "DE", "DE"], - }) + return pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) def test_impute_with_string_missing_and_automatically_find_variables(df_na): From 4fb5b7aa6cd37077cd91a046df8bf921e02e52b6 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 20:48:01 +0530 Subject: [PATCH 03/32] style: fix import order and duplicate pandas import --- feature_engine/imputation/categorical.py | 32 +++++++------------ .../test_categorical_imputer.py | 1 - 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index cc1c2e2d2..2d1f48e97 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -1,34 +1,26 @@ # Authors: Soledad Galli # License: BSD 3 clause -from typing import List, Optional, Union import warnings +from typing import List, Optional, Union import pandas as pd -from feature_engine._check_init_parameters.check_variables import ( - _check_variables_input_value, -) +from feature_engine._check_init_parameters.check_variables import \ + _check_variables_input_value from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, - _imputer_dict_docstring, - _n_features_in_docstring, - _variables_attribute_docstring, -) -from feature_engine._docstrings.methods import ( - _fit_transform_docstring, - _transform_imputers_docstring, -) + _feature_names_in_docstring, _imputer_dict_docstring, + _n_features_in_docstring, _variables_attribute_docstring) +from feature_engine._docstrings.methods import (_fit_transform_docstring, + _transform_imputers_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.imputation.base_imputer import BaseImputer from feature_engine.tags import _return_tags -from feature_engine.variable_handling import ( - check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables, -) +from feature_engine.variable_handling import (check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables) @Substitution( @@ -305,4 +297,4 @@ def _more_tags(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True - return tags \ No newline at end of file + return tags diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index c6ea41d89..788a7b924 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -import pandas as pd import pytest import warnings From 835133f4c12b072f09310d6a17c4f81aaadbc11f Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 22:49:48 +0530 Subject: [PATCH 04/32] test: add coverage for errors='ignore' branches --- .../test_categorical_imputer.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 788a7b924..995db0c69 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,7 +1,8 @@ +import warnings + import numpy as np import pandas as pd import pytest -import warnings from feature_engine.imputation import CategoricalImputer @@ -378,3 +379,27 @@ def test_errors_param_ignored_when_imputation_method_is_missing(): with warnings.catch_warnings(): warnings.simplefilter("error") imputer.fit(df) + + +def test_errors_ignore_single_variable(): + """errors='ignore' on single multimodal variable — silent, uses first mode.""" + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + + +def test_errors_ignore_multiple_variables(): + """errors='ignore' on multiple multimodal variables — silent, uses first mode.""" + X = pd.DataFrame( + { + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + } + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") + imputer.fit(X) + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + assert imputer.imputer_dict_["country"] == X["country"].mode()[0] \ No newline at end of file From 81f31d8af4613b2fbfd2b7ebbdbc6f3fa087c4b7 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Sun, 8 Mar 2026 22:53:33 +0530 Subject: [PATCH 05/32] style: add missing newline at end of test file --- tests/test_imputation/test_categorical_imputer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 995db0c69..de4ce0bc4 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -402,4 +402,4 @@ def test_errors_ignore_multiple_variables(): imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") imputer.fit(X) assert imputer.imputer_dict_["city"] == X["city"].mode()[0] - assert imputer.imputer_dict_["country"] == X["country"].mode()[0] \ No newline at end of file + assert imputer.imputer_dict_["country"] == X["country"].mode()[0] From 657de1f8468242f555b0a5fca602ad2e2374a8b6 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Mon, 9 Mar 2026 19:19:12 +0530 Subject: [PATCH 06/32] Changes for codedev tests --- .../test_categorical_imputer.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index de4ce0bc4..bc9d69a04 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -403,3 +403,50 @@ def test_errors_ignore_multiple_variables(): imputer.fit(X) assert imputer.imputer_dict_["city"] == X["city"].mode()[0] assert imputer.imputer_dict_["country"] == X["country"].mode()[0] + + +# ============================================================================= +# NEW TESTS — added to fix codecov patch coverage (1 missing + 1 partial line) +# ============================================================================= + +def test_errors_warn_single_variable_emits_userwarning(): + """ + Covers the warnings.warn() inside the SINGLE-VARIABLE block of fit(). + + The existing test_errors_warn_emits_userwarning uses multimodal_df (2 columns), + which goes through the multi-variable code path. This test uses variables='city' + (a single variable) to hit the separate single-variable warn branch. + """ + X = pd.DataFrame( + {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} + ) + imputer = CategoricalImputer( + imputation_method="frequent", variables="city", errors="warn" + ) + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(X) + # First mode is used + assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + + +def test_errors_raise_one_multimodal_among_multiple_variables(): + """ + Covers the `varnames_str = varnames[0]` else-branch in the MULTI-VARIABLE block. + + This branch is reached when multiple variables are selected but only ONE of them + turns out to have multiple modes. The existing tests either raise on all-multimodal + datasets (len(varnames) > 1) or use errors='ignore'/'warn' (skipping the raise). + Here we select two variables where only 'city' is multimodal, triggering the + singular else-branch before the ValueError is raised. + """ + X = pd.DataFrame( + { + # 'city': 3 equally frequent values → multimodal + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], + # 'country': clear single mode (UK appears 3×, others once) + "country": ["UK", "UK", "UK", "FR", "DE", "SE"], + } + ) + imputer = CategoricalImputer(imputation_method="frequent", errors="raise") + with pytest.raises(ValueError, match="city"): + imputer.fit(X) \ No newline at end of file From a0ea71dc5f06afa68659efaf762823f5a7cf15d9 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Mon, 16 Mar 2026 19:02:34 +0530 Subject: [PATCH 07/32] added space at last of test_categorical_imputer.py --- .gitignore | 1 + tests/test_imputation/test_categorical_imputer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3ba72acd9..0096d1595 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ celerybeat-schedule # Environments .env .venv +.venv_wsl env/ venv/ ENV/ diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index bc9d69a04..57fe62a3f 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -449,4 +449,4 @@ def test_errors_raise_one_multimodal_among_multiple_variables(): ) imputer = CategoricalImputer(imputation_method="frequent", errors="raise") with pytest.raises(ValueError, match="city"): - imputer.fit(X) \ No newline at end of file + imputer.fit(X) From 0cdcf03b018d2c8b181839922fb8298f213e7d13 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 22:53:09 +0530 Subject: [PATCH 08/32] Revert docs/whats_new/v_190.rst to upstream version --- docs/whats_new/v_190.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/whats_new/v_190.rst b/docs/whats_new/v_190.rst index f1b6e22da..3ee3222fb 100644 --- a/docs/whats_new/v_190.rst +++ b/docs/whats_new/v_190.rst @@ -53,7 +53,6 @@ New transformers Enhancements ~~~~~~~~~~~~ -- Added `errors` parameter to `CategoricalImputer` to handle categorical variables with multiple frequent categories instead of automatically raising a `ValueError`. (`DirekKakkar `_) - Our variable handling functions now return empty lists when no variables of the desired type are found. (`Soledad Galli `_) BUG From cf7670eb5431126c80e3febdeb103c0af4d08daa Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 23:06:12 +0530 Subject: [PATCH 09/32] changes done to `feature_engine/imputation/categorical.py` --- feature_engine/imputation/categorical.py | 30 +++++++++++++++--------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 2d1f48e97..6996e8bad 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -6,21 +6,29 @@ import pandas as pd -from feature_engine._check_init_parameters.check_variables import \ - _check_variables_input_value +from +feature_engine._check_init_parameters.check_variables +import ( + _check_variables_input_value, +) from feature_engine._docstrings.fit_attributes import ( - _feature_names_in_docstring, _imputer_dict_docstring, - _n_features_in_docstring, _variables_attribute_docstring) + _feature_names_in_docstring, + _imputer_dict_docstring, + _n_features_in_docstring, + _variables_attribute_docstring +) from feature_engine._docstrings.methods import (_fit_transform_docstring, _transform_imputers_docstring) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.imputation.base_imputer import BaseImputer from feature_engine.tags import _return_tags -from feature_engine.variable_handling import (check_all_variables, - check_categorical_variables, - find_all_variables, - find_categorical_variables) +from feature_engine.variable_handling import ( + check_all_variables, + check_categorical_variables, + find_all_variables, + find_categorical_variables +) @Substitution( @@ -81,8 +89,8 @@ class CategoricalImputer(BaseImputer): type object or categorical. If True, the imputer will select all variables or accept all variables entered by the user, including those cast as numeric. - errors : str, default='raise' - Indicates what to do when the selected imputation_method='frequent' + multimodal : str, default='raise' + Indicates what to do when imputation_method='frequent' and a variable has more than 1 mode. If 'raise', raises a ValueError and stops the fit. @@ -150,7 +158,7 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") - if errors not in ("raise", "warn", "ignore"): + if not isinstance(errors, str): raise ValueError( "errors takes only values 'raise', 'warn', or 'ignore'. " f"Got {errors} instead." From fb2f8db6191c659f392411f175f6633a5ef3634d Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 23:07:48 +0530 Subject: [PATCH 10/32] changes made to `tests/test_imputation/test_categorical_imputer.py` --- tests/test_imputation/test_categorical_imputer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 57fe62a3f..7874abd36 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -7,7 +7,6 @@ from feature_engine.imputation import CategoricalImputer -# --- Shared fixture: perfectly multimodal variable --- @pytest.fixture def multimodal_df(): return pd.DataFrame( From 97d6053b7eb1be7a16fe8a3be1693dc6c196a109 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 23:30:08 +0530 Subject: [PATCH 11/32] resolved comment done on R15 --- feature_engine/imputation/categorical.py | 6 +-- .../test_categorical_imputer.py | 45 +++++++++++-------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 6996e8bad..f4a4770c6 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -6,9 +6,7 @@ import pandas as pd -from -feature_engine._check_init_parameters.check_variables -import ( +from feature_engine._check_init_parameters.check_variables import ( _check_variables_input_value, ) from feature_engine._docstrings.fit_attributes import ( @@ -158,7 +156,7 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") - if not isinstance(errors, str): + if errors not in ["raise", "warn", "ignore"]: raise ValueError( "errors takes only values 'raise', 'warn', or 'ignore'. " f"Got {errors} instead." diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 7874abd36..1ea0661f0 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -11,8 +11,9 @@ def multimodal_df(): return pd.DataFrame( { - "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], - "country": ["UK", "UK", "FR", "FR", "DE", "DE"], + "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin", "Madrid"], + "country": ["UK", "UK", "FR", "FR", "DE", "DE", "ES"], + "one_mode": ["London", "London", "London", "Paris", "Paris", "Berlin", "Berlin"], } ) @@ -332,18 +333,31 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format): assert str(record.value) == msg -def test_errors_raise_on_multimodal_is_default(multimodal_df): - """Default behaviour: raise ValueError on multimodal variable.""" +def test_multimodal_raises_errors(multimodal_df): imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError, match="multiple frequent categories"): + msg = ( + "The variable(s) city, country contain(s) multiple frequent categories. " + "Set errors='warn' or errors='ignore' to allow imputation " + "using the first most frequent category found." + ) + with pytest.raises(ValueError) as record: imputer.fit(multimodal_df) + assert str(record.value) == msg -def test_errors_warn_emits_userwarning(multimodal_df): - """errors='warn': UserWarning must be emitted.""" +def test_multimodal_raises_warning(multimodal_df): imputer = CategoricalImputer(imputation_method="frequent", errors="warn") - with pytest.warns(UserWarning, match="multiple frequent categories"): + msg = ( + "Variable(s) city, country have multiple frequent categories. " + "The first category found will be used for imputation." + ) + with pytest.warns(UserWarning, match="multiple frequent categories") as record: imputer.fit(multimodal_df) + # Filter for the specific warning message in case others were raised + matching_warnings = [ + w for w in record if "multiple frequent categories" in str(w.message) + ] + assert str(matching_warnings[0].message) == msg def test_errors_warn_uses_first_mode(multimodal_df): @@ -351,17 +365,19 @@ def test_errors_warn_uses_first_mode(multimodal_df): imputer = CategoricalImputer(imputation_method="frequent", errors="warn") with pytest.warns(UserWarning): imputer.fit(multimodal_df) - expected = multimodal_df["city"].mode()[0] - assert imputer.imputer_dict_["city"] == expected + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + assert imputer.imputer_dict_["country"] == multimodal_df["country"].mode()[0] + assert imputer.imputer_dict_["one_mode"] == "London" def test_errors_ignore_no_warning_raised(multimodal_df): - """errors='ignore': no warnings should be emitted.""" imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") with warnings.catch_warnings(): warnings.simplefilter("error") # Promote all warnings to errors imputer.fit(multimodal_df) # Should NOT raise assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + assert imputer.imputer_dict_["country"] == multimodal_df["country"].mode()[0] + assert imputer.imputer_dict_["one_mode"] == "London" def test_errors_invalid_value_raises(): @@ -409,13 +425,6 @@ def test_errors_ignore_multiple_variables(): # ============================================================================= def test_errors_warn_single_variable_emits_userwarning(): - """ - Covers the warnings.warn() inside the SINGLE-VARIABLE block of fit(). - - The existing test_errors_warn_emits_userwarning uses multimodal_df (2 columns), - which goes through the multi-variable code path. This test uses variables='city' - (a single variable) to hit the separate single-variable warn branch. - """ X = pd.DataFrame( {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} ) From c454edd5ee786b2dac970fb89e72b0c693be0248 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 23:40:15 +0530 Subject: [PATCH 12/32] reformated the error tests to match the error from within pytest --- .../test_categorical_imputer.py | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 1ea0661f0..3cd3658ab 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,3 +1,4 @@ +import re import warnings import numpy as np @@ -159,7 +160,8 @@ def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_n def test_error_when_imputation_method_not_frequent_or_missing(): - with pytest.raises(ValueError): + msg = "imputation_method takes only values 'missing' or 'frequent'" + with pytest.raises(ValueError, match=msg): CategoricalImputer(imputation_method="arbitrary") @@ -170,10 +172,8 @@ def test_error_when_variable_contains_multiple_modes(df_na): "using the first most frequent category found." ) imputer = CategoricalImputer(imputation_method="frequent", variables="Name") - with pytest.raises(ValueError) as record: + with pytest.raises(ValueError, match=re.escape(msg)): imputer.fit(df_na) - # check that error message matches - assert str(record.value) == msg msg = ( "The variable(s) Name contain(s) multiple frequent categories. " @@ -181,10 +181,8 @@ def test_error_when_variable_contains_multiple_modes(df_na): "using the first most frequent category found." ) imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError) as record: + with pytest.raises(ValueError, match=re.escape(msg)): imputer.fit(df_na) - # check that error message matches - assert str(record.value) == msg df_ = df_na.copy() df_["Name_dup"] = df_["Name"] @@ -194,10 +192,8 @@ def test_error_when_variable_contains_multiple_modes(df_na): "using the first most frequent category found." ) imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError) as record: + with pytest.raises(ValueError, match=re.escape(msg)): imputer.fit(df_) - # check that error message matches - assert str(record.value) == msg def test_impute_numerical_variables(df_na): @@ -326,12 +322,9 @@ def test_variables_cast_as_category_frequent(df_na): ) def test_error_when_ignore_format_is_not_boolean(ignore_format): msg = "ignore_format takes only booleans True and False" - with pytest.raises(ValueError) as record: + with pytest.raises(ValueError, match=msg): CategoricalImputer(imputation_method="missing", ignore_format=ignore_format) - # check that error message matches - assert str(record.value) == msg - def test_multimodal_raises_errors(multimodal_df): imputer = CategoricalImputer(imputation_method="frequent") @@ -340,9 +333,8 @@ def test_multimodal_raises_errors(multimodal_df): "Set errors='warn' or errors='ignore' to allow imputation " "using the first most frequent category found." ) - with pytest.raises(ValueError) as record: + with pytest.raises(ValueError, match=re.escape(msg)): imputer.fit(multimodal_df) - assert str(record.value) == msg def test_multimodal_raises_warning(multimodal_df): From 5992d09aa27554044e01859b7e6bab998cd121c8 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 23:47:28 +0530 Subject: [PATCH 13/32] made three tests in on test --- .../test_categorical_imputer.py | 40 +++++-------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 3cd3658ab..fa918ed90 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -337,36 +337,18 @@ def test_multimodal_raises_errors(multimodal_df): imputer.fit(multimodal_df) -def test_multimodal_raises_warning(multimodal_df): - imputer = CategoricalImputer(imputation_method="frequent", errors="warn") - msg = ( - "Variable(s) city, country have multiple frequent categories. " - "The first category found will be used for imputation." - ) - with pytest.warns(UserWarning, match="multiple frequent categories") as record: - imputer.fit(multimodal_df) - # Filter for the specific warning message in case others were raised - matching_warnings = [ - w for w in record if "multiple frequent categories" in str(w.message) - ] - assert str(matching_warnings[0].message) == msg - - -def test_errors_warn_uses_first_mode(multimodal_df): - """errors='warn': imputer_dict_ should contain the first mode.""" - imputer = CategoricalImputer(imputation_method="frequent", errors="warn") - with pytest.warns(UserWarning): - imputer.fit(multimodal_df) - assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] - assert imputer.imputer_dict_["country"] == multimodal_df["country"].mode()[0] - assert imputer.imputer_dict_["one_mode"] == "London" +@pytest.mark.parametrize("errors", ["warn", "ignore"]) +def test_multimodal_imputation_result(multimodal_df, errors): + """Check that result is the same when errors='warn' or 'ignore'.""" + imputer = CategoricalImputer(imputation_method="frequent", errors=errors) + if errors == "warn": + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(multimodal_df) + else: + with warnings.catch_warnings(): + warnings.simplefilter("error") + imputer.fit(multimodal_df) - -def test_errors_ignore_no_warning_raised(multimodal_df): - imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") - with warnings.catch_warnings(): - warnings.simplefilter("error") # Promote all warnings to errors - imputer.fit(multimodal_df) # Should NOT raise assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] assert imputer.imputer_dict_["country"] == multimodal_df["country"].mode()[0] assert imputer.imputer_dict_["one_mode"] == "London" From 85b1974bb813d7dd7cd3d76c217a1583833446f5 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 23:49:09 +0530 Subject: [PATCH 14/32] left change --- tests/test_imputation/test_categorical_imputer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index fa918ed90..e86262554 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -345,9 +345,10 @@ def test_multimodal_imputation_result(multimodal_df, errors): with pytest.warns(UserWarning, match="multiple frequent categories"): imputer.fit(multimodal_df) else: - with warnings.catch_warnings(): - warnings.simplefilter("error") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") imputer.fit(multimodal_df) + assert len(w) == 0 assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] assert imputer.imputer_dict_["country"] == multimodal_df["country"].mode()[0] From 09429f3603962c2eefb8773232e32af685539dd3 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Thu, 26 Mar 2026 23:56:25 +0530 Subject: [PATCH 15/32] refaactored the multimodal tests --- tests/test_imputation/test_categorical_imputer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index e86262554..37e4a0be5 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -348,11 +348,11 @@ def test_multimodal_imputation_result(multimodal_df, errors): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") imputer.fit(multimodal_df) - assert len(w) == 0 - - assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] - assert imputer.imputer_dict_["country"] == multimodal_df["country"].mode()[0] - assert imputer.imputer_dict_["one_mode"] == "London" + # Check that no warnings with the specific message were raised + matching_warnings = [ + msg for msg in w if "multiple frequent categories" in str(msg.message) + ] + assert len(matching_warnings) == 0 def test_errors_invalid_value_raises(): From 0b86cfa702fe803f41706026f7c81794322202ed Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 00:01:00 +0530 Subject: [PATCH 16/32] refactored test_errors_invalid_value_raises --- tests/test_imputation/test_categorical_imputer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 37e4a0be5..7929854b4 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -355,10 +355,11 @@ def test_multimodal_imputation_result(multimodal_df, errors): assert len(matching_warnings) == 0 -def test_errors_invalid_value_raises(): +@pytest.mark.parametrize("errors", ["bad_value", 1, True]) +def test_errors_invalid_value_raises(errors): """Passing an unsupported value for errors should raise ValueError at init.""" with pytest.raises(ValueError, match="errors takes only values"): - CategoricalImputer(imputation_method="frequent", errors="bad_value") + CategoricalImputer(imputation_method="frequent", errors=errors) def test_errors_param_ignored_when_imputation_method_is_missing(): From 45f4e2f9749e5c1b55a9464fb4e67d1f09b30c91 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 00:06:48 +0530 Subject: [PATCH 17/32] changed the function `test_errors_param_ignored_when_imputation_method_is_missing` --- tests/test_imputation/test_categorical_imputer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 7929854b4..5d3ecb6ef 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -367,9 +367,13 @@ def test_errors_param_ignored_when_imputation_method_is_missing(): df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) imputer = CategoricalImputer(imputation_method="missing", errors="warn") # Should fit without warnings since there's no mode computation - with warnings.catch_warnings(): - warnings.simplefilter("error") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") imputer.fit(df) + matching_warnings = [ + msg for msg in w if "multiple frequent categories" in str(msg.message) + ] + assert len(matching_warnings) == 0 def test_errors_ignore_single_variable(): From cda93e70b49a57fc5f54f7463fda14cd3f92ba06 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 00:10:51 +0530 Subject: [PATCH 18/32] removed `test_errors_ignore_single_variable` `test_errors_ignore_multiple_variables` --- .../test_categorical_imputer.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 5d3ecb6ef..b2e3f9726 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -376,30 +376,6 @@ def test_errors_param_ignored_when_imputation_method_is_missing(): assert len(matching_warnings) == 0 -def test_errors_ignore_single_variable(): - """errors='ignore' on single multimodal variable — silent, uses first mode.""" - X = pd.DataFrame( - {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} - ) - imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") - imputer.fit(X) - assert imputer.imputer_dict_["city"] == X["city"].mode()[0] - - -def test_errors_ignore_multiple_variables(): - """errors='ignore' on multiple multimodal variables — silent, uses first mode.""" - X = pd.DataFrame( - { - "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], - "country": ["UK", "UK", "FR", "FR", "DE", "DE"], - } - ) - imputer = CategoricalImputer(imputation_method="frequent", errors="ignore") - imputer.fit(X) - assert imputer.imputer_dict_["city"] == X["city"].mode()[0] - assert imputer.imputer_dict_["country"] == X["country"].mode()[0] - - # ============================================================================= # NEW TESTS — added to fix codecov patch coverage (1 missing + 1 partial line) # ============================================================================= From 04be1a0255416b25c1a608e1fd481c9c7c89c876 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 00:11:41 +0530 Subject: [PATCH 19/32] emove the commented block --- tests/test_imputation/test_categorical_imputer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index b2e3f9726..011a8b2f8 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -376,10 +376,6 @@ def test_errors_param_ignored_when_imputation_method_is_missing(): assert len(matching_warnings) == 0 -# ============================================================================= -# NEW TESTS — added to fix codecov patch coverage (1 missing + 1 partial line) -# ============================================================================= - def test_errors_warn_single_variable_emits_userwarning(): X = pd.DataFrame( {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} From 94643d8d7fc27c6acbabc922dacb68d5ed18be17 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 00:15:30 +0530 Subject: [PATCH 20/32] last few changes made --- .../test_categorical_imputer.py | 31 +++++-------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 011a8b2f8..c0819ea6b 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -376,37 +376,22 @@ def test_errors_param_ignored_when_imputation_method_is_missing(): assert len(matching_warnings) == 0 -def test_errors_warn_single_variable_emits_userwarning(): - X = pd.DataFrame( - {"city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"]} - ) +def test_warning_when_single_variable_is_multimodal(multimodal_df): imputer = CategoricalImputer( imputation_method="frequent", variables="city", errors="warn" ) with pytest.warns(UserWarning, match="multiple frequent categories"): - imputer.fit(X) - # First mode is used - assert imputer.imputer_dict_["city"] == X["city"].mode()[0] + imputer.fit(multimodal_df) + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] -def test_errors_raise_one_multimodal_among_multiple_variables(): +def test_errors_raise_when_only_one_variable_is_multimodal(multimodal_df): """ - Covers the `varnames_str = varnames[0]` else-branch in the MULTI-VARIABLE block. - This branch is reached when multiple variables are selected but only ONE of them - turns out to have multiple modes. The existing tests either raise on all-multimodal - datasets (len(varnames) > 1) or use errors='ignore'/'warn' (skipping the raise). - Here we select two variables where only 'city' is multimodal, triggering the - singular else-branch before the ValueError is raised. + turns out to have multiple modes. """ - X = pd.DataFrame( - { - # 'city': 3 equally frequent values → multimodal - "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin"], - # 'country': clear single mode (UK appears 3×, others once) - "country": ["UK", "UK", "UK", "FR", "DE", "SE"], - } + imputer = CategoricalImputer( + imputation_method="frequent", variables=["city", "one_mode"], errors="raise" ) - imputer = CategoricalImputer(imputation_method="frequent", errors="raise") with pytest.raises(ValueError, match="city"): - imputer.fit(X) + imputer.fit(multimodal_df) From ab6ba66033d979a882d2e3838905129c4a0d46e1 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 00:37:12 +0530 Subject: [PATCH 21/32] test case style updated --- tests/test_imputation/test_categorical_imputer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index c0819ea6b..f5d0b8de0 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -12,9 +12,13 @@ def multimodal_df(): return pd.DataFrame( { - "city": ["London", "London", "Paris", "Paris", "Berlin", "Berlin", "Madrid"], + "city": [ + "London", "London", "Paris", "Paris", "Berlin", "Berlin", "Madrid" + ], "country": ["UK", "UK", "FR", "FR", "DE", "DE", "ES"], - "one_mode": ["London", "London", "London", "Paris", "Paris", "Berlin", "Berlin"], + "one_mode": [ + "London", "London", "London", "Paris", "Paris", "Berlin", "Berlin" + ], } ) From 6ba7fceb7eee54f36c7ca5db821ca4c709029231 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:49:34 +0530 Subject: [PATCH 22/32] Renamed `errors` to `multimodal` in CategoricalImputer and add missing test --- feature_engine/imputation/categorical.py | 24 ++++---- .../test_categorical_imputer.py | 56 ++++++++++++------- 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index f4a4770c6..42e5002c3 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -146,7 +146,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, return_object: bool = False, ignore_format: bool = False, - errors: str = "raise", + multimodal: str = "raise", ) -> None: if imputation_method not in ["missing", "frequent"]: raise ValueError( @@ -156,10 +156,10 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") - if errors not in ["raise", "warn", "ignore"]: + if multimodal not in ["raise", "warn", "ignore"]: raise ValueError( - "errors takes only values 'raise', 'warn', or 'ignore'. " - f"Got {errors} instead." + "multimodal takes only values 'raise', 'warn', or 'ignore'. " + f"Got {multimodal} instead." ) self.imputation_method = imputation_method @@ -167,7 +167,7 @@ def __init__( self.variables = _check_variables_input_value(variables) self.return_object = return_object self.ignore_format = ignore_format - self.errors = errors + self.multimodal = multimodal def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ @@ -208,14 +208,14 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Some variables may contain more than 1 mode: if len(mode_vals) > 1: - if self.errors == "raise": + if self.multimodal == "raise": raise ValueError( f"The variable {var} contains multiple " - f"frequent categories. Set errors='warn' or " - f"errors='ignore' to allow imputation using " + f"frequent categories. Set multimodal='warn' or " + f"multimodal='ignore' to allow imputation using " f"the first most frequent category found." ) - elif self.errors == "warn": + elif self.multimodal == "warn": warnings.warn( f"Variable {var} has multiple frequent " f"categories. The first category found, " @@ -239,15 +239,15 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): else: varnames_str = varnames[0] - if self.errors == "raise": + if self.multimodal == "raise": raise ValueError( f"The variable(s) {varnames_str} contain(s) " f"multiple frequent categories. Set " - f"errors='warn' or errors='ignore' to allow " + f"multimodal='warn' or multimodal='ignore' to allow " f"imputation using the first most frequent " f"category found." ) - elif self.errors == "warn": + elif self.multimodal == "warn": warnings.warn( f"Variable(s) {varnames_str} have multiple " f"frequent categories. The first category " diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index f5d0b8de0..066baa0f7 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -172,7 +172,7 @@ def test_error_when_imputation_method_not_frequent_or_missing(): def test_error_when_variable_contains_multiple_modes(df_na): msg = ( "The variable Name contains multiple frequent categories. " - "Set errors='warn' or errors='ignore' to allow imputation " + "Set multimodal='warn' or multimodal='ignore' to allow imputation " "using the first most frequent category found." ) imputer = CategoricalImputer(imputation_method="frequent", variables="Name") @@ -181,7 +181,7 @@ def test_error_when_variable_contains_multiple_modes(df_na): msg = ( "The variable(s) Name contain(s) multiple frequent categories. " - "Set errors='warn' or errors='ignore' to allow imputation " + "Set multimodal='warn' or multimodal='ignore' to allow imputation " "using the first most frequent category found." ) imputer = CategoricalImputer(imputation_method="frequent") @@ -192,7 +192,7 @@ def test_error_when_variable_contains_multiple_modes(df_na): df_["Name_dup"] = df_["Name"] msg = ( "The variable(s) Name, Name_dup contain(s) multiple frequent categories. " - "Set errors='warn' or errors='ignore' to allow imputation " + "Set multimodal='warn' or multimodal='ignore' to allow imputation " "using the first most frequent category found." ) imputer = CategoricalImputer(imputation_method="frequent") @@ -334,18 +334,18 @@ def test_multimodal_raises_errors(multimodal_df): imputer = CategoricalImputer(imputation_method="frequent") msg = ( "The variable(s) city, country contain(s) multiple frequent categories. " - "Set errors='warn' or errors='ignore' to allow imputation " + "Set multimodal='warn' or multimodal='ignore' to allow imputation " "using the first most frequent category found." ) with pytest.raises(ValueError, match=re.escape(msg)): imputer.fit(multimodal_df) -@pytest.mark.parametrize("errors", ["warn", "ignore"]) -def test_multimodal_imputation_result(multimodal_df, errors): - """Check that result is the same when errors='warn' or 'ignore'.""" - imputer = CategoricalImputer(imputation_method="frequent", errors=errors) - if errors == "warn": +@pytest.mark.parametrize("multimodal", ["warn", "ignore"]) +def test_multimodal_imputation_result(multimodal_df, multimodal): + """Check that result is the same when multimodal='warn' or 'ignore'.""" + imputer = CategoricalImputer(imputation_method="frequent", multimodal=multimodal) + if multimodal == "warn": with pytest.warns(UserWarning, match="multiple frequent categories"): imputer.fit(multimodal_df) else: @@ -359,17 +359,17 @@ def test_multimodal_imputation_result(multimodal_df, errors): assert len(matching_warnings) == 0 -@pytest.mark.parametrize("errors", ["bad_value", 1, True]) -def test_errors_invalid_value_raises(errors): - """Passing an unsupported value for errors should raise ValueError at init.""" - with pytest.raises(ValueError, match="errors takes only values"): - CategoricalImputer(imputation_method="frequent", errors=errors) +@pytest.mark.parametrize("multimodal", ["bad_value", 1, True]) +def test_multimodal_invalid_value_raises(multimodal): + """Passing an unsupported value for multimodal should raise ValueError at init.""" + with pytest.raises(ValueError, match="multimodal takes only values"): + CategoricalImputer(imputation_method="frequent", multimodal=multimodal) -def test_errors_param_ignored_when_imputation_method_is_missing(): - """errors param has no effect for imputation_method='missing'.""" +def test_multimodal_param_ignored_when_imputation_method_is_missing(): + """multimodal param has no effect for imputation_method='missing'.""" df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) - imputer = CategoricalImputer(imputation_method="missing", errors="warn") + imputer = CategoricalImputer(imputation_method="missing", multimodal="warn") # Should fit without warnings since there's no mode computation with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -382,20 +382,36 @@ def test_errors_param_ignored_when_imputation_method_is_missing(): def test_warning_when_single_variable_is_multimodal(multimodal_df): imputer = CategoricalImputer( - imputation_method="frequent", variables="city", errors="warn" + imputation_method="frequent", variables="city", multimodal="warn" ) with pytest.warns(UserWarning, match="multiple frequent categories"): imputer.fit(multimodal_df) assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] -def test_errors_raise_when_only_one_variable_is_multimodal(multimodal_df): +def test_warning_when_single_variable_in_list_is_multimodal(multimodal_df): + # Test for multimodal='warn' when passing 1 variable in a list + # to the variables parameter. + imputer = CategoricalImputer( + imputation_method="frequent", variables=["city"], multimodal="warn" + ) + with pytest.warns(UserWarning) as record: + imputer.fit(multimodal_df) + + # check that warning was raised exactly once + assert len(record) == 1 + # check that warning message is as expected + assert "Variable city has multiple frequent categories" in str(record[0].message) + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_multimodal_raise_when_only_one_variable_is_multimodal(multimodal_df): """ This branch is reached when multiple variables are selected but only ONE of them turns out to have multiple modes. """ imputer = CategoricalImputer( - imputation_method="frequent", variables=["city", "one_mode"], errors="raise" + imputation_method="frequent", variables=["city", "one_mode"], multimodal="raise" ) with pytest.raises(ValueError, match="city"): imputer.fit(multimodal_df) From 1a3fde2b0913d4b9d2467bcb5438fbdfa848c64b Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:56:36 +0530 Subject: [PATCH 23/32] Apply suggestion from @solegalli Co-authored-by: Soledad Galli --- feature_engine/imputation/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 42e5002c3..db5a7be04 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -93,7 +93,7 @@ class CategoricalImputer(BaseImputer): If 'raise', raises a ValueError and stops the fit. - If 'warn', raises a UserWarning and continues, imputing using the + If 'warn', raises a UserWarning and continues the imputation using the first most frequent category found. If 'ignore', continues without warnings, imputing using the first From 36eb1dcafb5273fb7e1a180d69fe06e435abcdc2 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:56:46 +0530 Subject: [PATCH 24/32] Apply suggestion from @solegalli Co-authored-by: Soledad Galli --- feature_engine/imputation/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index db5a7be04..4c15e8573 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -25,7 +25,7 @@ check_all_variables, check_categorical_variables, find_all_variables, - find_categorical_variables + find_categorical_variables, ) From aa37d1915bbf4a383b9870b8cedacd1370a4c76f Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 19:58:03 +0530 Subject: [PATCH 25/32] Update categorical.py --- feature_engine/imputation/categorical.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 4c15e8573..baaff30ea 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -15,8 +15,10 @@ _n_features_in_docstring, _variables_attribute_docstring ) -from feature_engine._docstrings.methods import (_fit_transform_docstring, - _transform_imputers_docstring) +from feature_engine._docstrings.methods import ( + _fit_transform_docstring, + _transform_imputers_docstring, +) from feature_engine._docstrings.substitute import Substitution from feature_engine.dataframe_checks import check_X from feature_engine.imputation.base_imputer import BaseImputer From 3e58d8bc22f14322c306c28ddaac58451829e7e2 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 21:08:33 +0530 Subject: [PATCH 26/32] removed comments and added tests --- feature_engine/imputation/categorical.py | 17 +---- .../test_categorical_imputer.py | 63 +++++-------------- 2 files changed, 17 insertions(+), 63 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 42e5002c3..60e0ff75c 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -182,10 +182,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): y is not needed in this imputation. You can pass None or y. """ - # check input dataframe X = check_X(X) - # select variables to encode if self.ignore_format is True: if self.variables is None: self.variables_ = find_all_variables(X) @@ -201,12 +199,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.imputer_dict_ = {var: self.fill_value for var in self.variables_} elif self.imputation_method == "frequent": - # if imputing only 1 variable: if len(self.variables_) == 1: var = self.variables_[0] mode_vals = X[var].mode() - # Some variables may contain more than 1 mode: if len(mode_vals) > 1: if self.multimodal == "raise": raise ValueError( @@ -225,13 +221,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.imputer_dict_ = {var: mode_vals[0]} - # imputing multiple variables: else: - # Returns a dataframe with 1 row if there is one mode per - # variable, or more rows if there are more modes: mode_vals = X[self.variables_].mode() - # Careful: some variables contain multiple modes if len(mode_vals) > 1: varnames = mode_vals.dropna(axis=1).columns.to_list() if len(varnames) > 1: @@ -262,16 +254,14 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: - # Frequent category imputation if self.imputation_method == "frequent": X = super().transform(X) - # Imputation with string + else: X = self._transform(X) - # if variable is of type category, we need to add the new - # category, before filling in the nan + add_cats = {} for variable in self.variables_: if X[variable].dtype.name == "category": @@ -285,13 +275,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = X.assign(**add_cats).fillna(self.imputer_dict_) - # add additional step to return variables cast as object + if self.return_object: X[self.variables_] = X[self.variables_].astype("O") return X - # Get docstring from BaseClass transform.__doc__ = BaseImputer.transform.__doc__ def _more_tags(self): diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 066baa0f7..bf216b4d0 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -56,24 +56,20 @@ def test_impute_with_string_missing_and_automatically_find_variables(df_na): def test_user_defined_string_and_automatically_find_variables(df_na): - # set up imputer imputer = CategoricalImputer( imputation_method="missing", fill_value="Unknown", variables=None ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Name"] = X_reference["Name"].fillna("Unknown") X_reference["City"] = X_reference["City"].fillna("Unknown") X_reference["Studies"] = X_reference["Studies"].fillna("Unknown") - # test init params assert imputer.imputation_method == "missing" assert imputer.fill_value == "Unknown" assert imputer.variables is None - # tes fit attributes assert imputer.variables_ == ["Name", "City", "Studies"] assert imputer.n_features_in_ == 6 assert imputer.imputer_dict_ == { @@ -82,22 +78,18 @@ def test_user_defined_string_and_automatically_find_variables(df_na): "Studies": "Unknown", } - # test transform output: assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 pd.testing.assert_frame_equal(X_transformed, X_reference) def test_mode_imputation_and_single_variable(df_na): - # set up imputer imputer = CategoricalImputer(imputation_method="frequent", variables="City") X_transformed = imputer.fit_transform(df_na) - # set up expected result X_reference = df_na.copy() X_reference["City"] = X_reference["City"].fillna("London") - # test init, fit and transform params, attr and output assert imputer.imputation_method == "frequent" assert imputer.variables == "City" assert imputer.variables_ == ["City"] @@ -109,24 +101,20 @@ def test_mode_imputation_and_single_variable(df_na): def test_mode_imputation_with_multiple_variables(df_na): - # set up imputer imputer = CategoricalImputer( imputation_method="frequent", variables=["Studies", "City"] ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["City"] = X_reference["City"].fillna("London") X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor") - # test fit attr and transform output assert imputer.imputer_dict_ == {"Studies": "Bachelor", "City": "London"} pd.testing.assert_frame_equal(X_transformed, X_reference) def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_numerical(df_na): - # test case: imputing of numerical variables cast as object + return numeric df_na = df_na.copy() df_na["Marks"] = df_na["Marks"].astype("O") imputer = CategoricalImputer( @@ -150,8 +138,6 @@ def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_numerical(d def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_na): - # test case 6: imputing of numerical variables cast as object + return as object - # after imputation df_na = df_na.copy() df_na["Marks"] = df_na["Marks"].astype("O") imputer = CategoricalImputer( @@ -201,7 +187,6 @@ def test_error_when_variable_contains_multiple_modes(df_na): def test_impute_numerical_variables(df_na): - # set up transformer imputer = CategoricalImputer( imputation_method="missing", fill_value=0, @@ -210,24 +195,19 @@ def test_impute_numerical_variables(df_na): ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference = X_reference.fillna(0) - # test init params assert imputer.imputation_method == "missing" assert imputer.variables == ["Name", "City", "Studies", "Age", "Marks"] - # test fit attributes assert imputer.variables_ == ["Name", "City", "Studies", "Age", "Marks"] assert imputer.n_features_in_ == 6 - # test transform params pd.testing.assert_frame_equal(X_transformed, X_reference) def test_impute_numerical_variables_with_mode(df_na): - # set up transformer imputer = CategoricalImputer( imputation_method="frequent", variables=["City", "Studies", "Marks"], @@ -235,16 +215,13 @@ def test_impute_numerical_variables_with_mode(df_na): ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["City"] = X_reference["City"].fillna("London") X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor") X_reference["Marks"] = X_reference["Marks"].fillna(0.8) - # test init params assert imputer.variables == ["City", "Studies", "Marks"] - # test fit attributes assert imputer.variables_ == ["City", "Studies", "Marks"] assert imputer.n_features_in_ == 6 assert imputer.imputer_dict_ == { @@ -253,7 +230,6 @@ def test_impute_numerical_variables_with_mode(df_na): "Marks": 0.8, } - # test transform output pd.testing.assert_frame_equal(X_transformed, X_reference) @@ -265,7 +241,6 @@ def test_variables_cast_as_category_missing(df_na): imputer = CategoricalImputer(imputation_method="missing", variables=None) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Name"] = X_reference["Name"].fillna("Missing") X_reference["Studies"] = X_reference["Studies"].fillna("Missing") @@ -274,7 +249,6 @@ def test_variables_cast_as_category_missing(df_na): X_reference["City"].cat.add_categories("Missing").fillna("Missing") ) - # test fit attributes assert imputer.variables_ == ["Name", "City", "Studies"] assert imputer.imputer_dict_ == { "Name": "Missing", @@ -282,9 +256,6 @@ def test_variables_cast_as_category_missing(df_na): "Studies": "Missing", } - # test transform output - # selected columns should have no NA - # non selected columns should still have NA assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 pd.testing.assert_frame_equal(X_transformed, X_reference) @@ -294,27 +265,21 @@ def test_variables_cast_as_category_frequent(df_na): df_na = df_na.copy() df_na["City"] = df_na["City"].astype("category") - # this variable does not have a mode, so drop df_na.drop(labels=["Name"], axis=1, inplace=True) imputer = CategoricalImputer(imputation_method="frequent", variables=None) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor") X_reference["City"] = X_reference["City"].fillna("London") - # test fit attributes assert imputer.variables_ == ["City", "Studies"] assert imputer.imputer_dict_ == { "City": "London", "Studies": "Bachelor", } - # test transform output - # selected columns should have no NA - # non selected columns should still have NA assert X_transformed[["City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 pd.testing.assert_frame_equal(X_transformed, X_reference) @@ -334,8 +299,6 @@ def test_multimodal_raises_errors(multimodal_df): imputer = CategoricalImputer(imputation_method="frequent") msg = ( "The variable(s) city, country contain(s) multiple frequent categories. " - "Set multimodal='warn' or multimodal='ignore' to allow imputation " - "using the first most frequent category found." ) with pytest.raises(ValueError, match=re.escape(msg)): imputer.fit(multimodal_df) @@ -343,7 +306,6 @@ def test_multimodal_raises_errors(multimodal_df): @pytest.mark.parametrize("multimodal", ["warn", "ignore"]) def test_multimodal_imputation_result(multimodal_df, multimodal): - """Check that result is the same when multimodal='warn' or 'ignore'.""" imputer = CategoricalImputer(imputation_method="frequent", multimodal=multimodal) if multimodal == "warn": with pytest.warns(UserWarning, match="multiple frequent categories"): @@ -352,7 +314,6 @@ def test_multimodal_imputation_result(multimodal_df, multimodal): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") imputer.fit(multimodal_df) - # Check that no warnings with the specific message were raised matching_warnings = [ msg for msg in w if "multiple frequent categories" in str(msg.message) ] @@ -361,13 +322,11 @@ def test_multimodal_imputation_result(multimodal_df, multimodal): @pytest.mark.parametrize("multimodal", ["bad_value", 1, True]) def test_multimodal_invalid_value_raises(multimodal): - """Passing an unsupported value for multimodal should raise ValueError at init.""" with pytest.raises(ValueError, match="multimodal takes only values"): CategoricalImputer(imputation_method="frequent", multimodal=multimodal) def test_multimodal_param_ignored_when_imputation_method_is_missing(): - """multimodal param has no effect for imputation_method='missing'.""" df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) imputer = CategoricalImputer(imputation_method="missing", multimodal="warn") # Should fit without warnings since there's no mode computation @@ -390,26 +349,32 @@ def test_warning_when_single_variable_is_multimodal(multimodal_df): def test_warning_when_single_variable_in_list_is_multimodal(multimodal_df): - # Test for multimodal='warn' when passing 1 variable in a list - # to the variables parameter. imputer = CategoricalImputer( imputation_method="frequent", variables=["city"], multimodal="warn" ) with pytest.warns(UserWarning) as record: imputer.fit(multimodal_df) - # check that warning was raised exactly once assert len(record) == 1 - # check that warning message is as expected assert "Variable city has multiple frequent categories" in str(record[0].message) assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] +def test_ignore_when_single_variable_is_multimodal(multimodal_df): + imputer = CategoricalImputer( + imputation_method="frequent", variables="city", multimodal="ignore" + ) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + imputer.fit(multimodal_df) + matching_warnings = [ + msg for msg in w if "multiple frequent categories" in str(msg.message) + ] + assert len(matching_warnings) == 0 + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + def test_multimodal_raise_when_only_one_variable_is_multimodal(multimodal_df): - """ - This branch is reached when multiple variables are selected but only ONE of them - turns out to have multiple modes. - """ imputer = CategoricalImputer( imputation_method="frequent", variables=["city", "one_mode"], multimodal="raise" ) From c77e8f178b49258e62825fcf8fb9edabf98ec011 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 21:10:33 +0530 Subject: [PATCH 27/32] Update .gitignore --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 0096d1595..399a7473b 100644 --- a/.gitignore +++ b/.gitignore @@ -86,7 +86,6 @@ celerybeat-schedule # Environments .env .venv -.venv_wsl env/ venv/ ENV/ @@ -112,4 +111,4 @@ venv.bak/ *.csv *.DS_Store *.db -*.pptx \ No newline at end of file +*.pptx From a22f586ab82f15c26162663a4ea5f950a8bfc889 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 21:14:40 +0530 Subject: [PATCH 28/32] removed the spaces --- feature_engine/imputation/categorical.py | 6 ------ tests/test_imputation/test_categorical_imputer.py | 7 ------- 2 files changed, 13 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 6c927f6ce..a64b3a49b 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -258,12 +258,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): def transform(self, X: pd.DataFrame) -> pd.DataFrame: if self.imputation_method == "frequent": X = super().transform(X) - - else: X = self._transform(X) - - add_cats = {} for variable in self.variables_: if X[variable].dtype.name == "category": @@ -277,10 +273,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = X.assign(**add_cats).fillna(self.imputer_dict_) - if self.return_object: X[self.variables_] = X[self.variables_].astype("O") - return X transform.__doc__ = BaseImputer.transform.__doc__ diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index bf216b4d0..a179d0e80 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -24,21 +24,17 @@ def multimodal_df(): def test_impute_with_string_missing_and_automatically_find_variables(df_na): - # set up transformer imputer = CategoricalImputer(imputation_method="missing", variables=None) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Name"] = X_reference["Name"].fillna("Missing") X_reference["City"] = X_reference["City"].fillna("Missing") X_reference["Studies"] = X_reference["Studies"].fillna("Missing") - # test init params assert imputer.imputation_method == "missing" assert imputer.variables is None - # test fit attributes assert imputer.variables_ == ["Name", "City", "Studies"] assert imputer.n_features_in_ == 6 assert imputer.imputer_dict_ == { @@ -47,9 +43,6 @@ def test_impute_with_string_missing_and_automatically_find_variables(df_na): "Studies": "Missing", } - # test transform output - # selected columns should have no NA - # non selected columns should still have NA assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 pd.testing.assert_frame_equal(X_transformed, X_reference) From 7156d2806ad6be6630794f9e6beb72093e172637 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 21:17:40 +0530 Subject: [PATCH 29/32] removed the spaces --- feature_engine/imputation/categorical.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index a64b3a49b..5bc772e4c 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -252,7 +252,6 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.imputer_dict_ = mode_vals.iloc[0].to_dict() self._get_feature_names_in(X) - return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -272,7 +271,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: ) X = X.assign(**add_cats).fillna(self.imputer_dict_) - + if self.return_object: X[self.variables_] = X[self.variables_].astype("O") return X From 5d65fe8b425bb970d065ce9376170859423c1e9e Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 21:25:15 +0530 Subject: [PATCH 30/32] simplified the test case as asked --- .../test_categorical_imputer.py | 27 +++---------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index a179d0e80..7be36a754 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,4 +1,3 @@ -import re import warnings import numpy as np @@ -149,33 +148,18 @@ def test_error_when_imputation_method_not_frequent_or_missing(): def test_error_when_variable_contains_multiple_modes(df_na): - msg = ( - "The variable Name contains multiple frequent categories. " - "Set multimodal='warn' or multimodal='ignore' to allow imputation " - "using the first most frequent category found." - ) imputer = CategoricalImputer(imputation_method="frequent", variables="Name") - with pytest.raises(ValueError, match=re.escape(msg)): + with pytest.raises(ValueError, match="The variable Name contains multiple frequent categories"): imputer.fit(df_na) - msg = ( - "The variable(s) Name contain(s) multiple frequent categories. " - "Set multimodal='warn' or multimodal='ignore' to allow imputation " - "using the first most frequent category found." - ) imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError, match=re.escape(msg)): + with pytest.raises(ValueError, match="The variable\(s\) Name contain\(s\) multiple frequent categories"): imputer.fit(df_na) df_ = df_na.copy() df_["Name_dup"] = df_["Name"] - msg = ( - "The variable(s) Name, Name_dup contain(s) multiple frequent categories. " - "Set multimodal='warn' or multimodal='ignore' to allow imputation " - "using the first most frequent category found." - ) imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError, match=re.escape(msg)): + with pytest.raises(ValueError, match="The variable\(s\) Name, Name_dup contain\(s\) multiple frequent categories"): imputer.fit(df_) @@ -290,10 +274,7 @@ def test_error_when_ignore_format_is_not_boolean(ignore_format): def test_multimodal_raises_errors(multimodal_df): imputer = CategoricalImputer(imputation_method="frequent") - msg = ( - "The variable(s) city, country contain(s) multiple frequent categories. " - ) - with pytest.raises(ValueError, match=re.escape(msg)): + with pytest.raises(ValueError, match="multiple frequent categories"): imputer.fit(multimodal_df) From a95f5e03c041a4bb1d133ef2be8db8918ca66ff1 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 21:28:50 +0530 Subject: [PATCH 31/32] simplified the test case as asked --- feature_engine/imputation/categorical.py | 2 +- tests/test_imputation/test_categorical_imputer.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 5bc772e4c..e7200287c 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -271,7 +271,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: ) X = X.assign(**add_cats).fillna(self.imputer_dict_) - + if self.return_object: X[self.variables_] = X[self.variables_].astype("O") return X diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 7be36a754..5735fdf46 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -153,13 +153,15 @@ def test_error_when_variable_contains_multiple_modes(df_na): imputer.fit(df_na) imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError, match="The variable\(s\) Name contain\(s\) multiple frequent categories"): + msg = r"The variable\(s\) Name contain\(s\) multiple frequent categories" + with pytest.raises(ValueError, match=msg): imputer.fit(df_na) df_ = df_na.copy() df_["Name_dup"] = df_["Name"] imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError, match="The variable\(s\) Name, Name_dup contain\(s\) multiple frequent categories"): + msg = r"The variable\(s\) Name, Name_dup contain\(s\) multiple frequent categories" + with pytest.raises(ValueError, match=msg): imputer.fit(df_) From 6f5b4da923f3503b404907dcda1e1e3e54eab142 Mon Sep 17 00:00:00 2001 From: Direk Kakkar Date: Fri, 27 Mar 2026 21:32:42 +0530 Subject: [PATCH 32/32] simplified the test case as asked --- tests/test_imputation/test_categorical_imputer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 5735fdf46..b1d36da8c 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -149,7 +149,8 @@ def test_error_when_imputation_method_not_frequent_or_missing(): def test_error_when_variable_contains_multiple_modes(df_na): imputer = CategoricalImputer(imputation_method="frequent", variables="Name") - with pytest.raises(ValueError, match="The variable Name contains multiple frequent categories"): + msg = "The variable Name contains multiple frequent categories" + with pytest.raises(ValueError, match=msg): imputer.fit(df_na) imputer = CategoricalImputer(imputation_method="frequent")