diff --git a/docs/src/api.rst b/docs/src/api.rst index f257b60a5..f11165742 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -15,6 +15,7 @@ Base Classes base_variable_importance.BaseVariableImportance base_perturbation.BasePerturbation + base_variable_importance.GroupVariableImportanceMixin Feature Importance Classes ========================== diff --git a/docs/src/concepts.rst b/docs/src/concepts.rst index 30524faea..70a61aa2e 100644 --- a/docs/src/concepts.rst +++ b/docs/src/concepts.rst @@ -3,4 +3,4 @@ ====================== Definition of concepts -====================== \ No newline at end of file +====================== diff --git a/docs/src/glm_methods.rst b/docs/src/glm_methods.rst index 508a47e23..ec4831bf1 100644 --- a/docs/src/glm_methods.rst +++ b/docs/src/glm_methods.rst @@ -9,4 +9,4 @@ GLM methods :maxdepth: 2 glm_methods/desparsified_lasso.rst - glm_methods/knockoffs.rst \ No newline at end of file + glm_methods/knockoffs.rst diff --git a/docs/src/grouping.rst b/docs/src/grouping.rst index cc9412b83..6e1681a49 100644 --- a/docs/src/grouping.rst +++ b/docs/src/grouping.rst @@ -3,4 +3,4 @@ ========================================== Measuring the importance of feature groups -========================================== \ No newline at end of file +========================================== diff --git a/docs/src/high_dimension.rst b/docs/src/high_dimension.rst index 41108d48e..e655fe368 100644 --- a/docs/src/high_dimension.rst +++ b/docs/src/high_dimension.rst @@ -3,4 +3,4 @@ =========================== Inference in high dimension -=========================== \ No newline at end of file +=========================== diff --git a/docs/src/marginal_methods.rst b/docs/src/marginal_methods.rst index 630da8a73..cde7450a0 100644 --- a/docs/src/marginal_methods.rst +++ b/docs/src/marginal_methods.rst @@ -8,4 +8,4 @@ Marginal methods .. 
toctree:: :maxdepth: 2 - marginal_methods/leave_one_covariate_in.rst \ No newline at end of file + marginal_methods/leave_one_covariate_in.rst diff --git a/docs/src/model_agnostic_methods/leave_one_covariate_out.rst b/docs/src/model_agnostic_methods/leave_one_covariate_out.rst index 392ca6a7b..1518a84b7 100644 --- a/docs/src/model_agnostic_methods/leave_one_covariate_out.rst +++ b/docs/src/model_agnostic_methods/leave_one_covariate_out.rst @@ -5,4 +5,4 @@ Leave-One-Covariate-Out ======================== -TODO: Write this section. \ No newline at end of file +TODO: Write this section. diff --git a/docs/src/model_agnostic_methods/total_sobol_index.rst b/docs/src/model_agnostic_methods/total_sobol_index.rst index 1f3083787..9ce6fd089 100644 --- a/docs/src/model_agnostic_methods/total_sobol_index.rst +++ b/docs/src/model_agnostic_methods/total_sobol_index.rst @@ -16,4 +16,4 @@ where :math:`X^{-j}` denotes the feature vector without the :math:`j^{th}` featu :math:`\mu_{-j}(X^{-j})` is the same predictive model as :math:`\mu(X)` but retrained on the reduced feature set :math:`X^{-j}`. When :math:`\mathcal{L}` is the squared loss, for a regression task, :math:`\mu_{-j}(X^{-j}) = \mathbb{E}[y | X^{-j}]` and when -:math:`\mathcal{L}` is the log-loss, for a classification task, :math:`\mu_{-j}(X^{-j}) = P(y | X^{-j})`. \ No newline at end of file +:math:`\mathcal{L}` is the log-loss, for a classification task, :math:`\mu_{-j}(X^{-j}) = P(y | X^{-j})`. 
diff --git a/docs/src/visualization.rst b/docs/src/visualization.rst index 6562ae28d..18e815cb8 100644 --- a/docs/src/visualization.rst +++ b/docs/src/visualization.rst @@ -3,4 +3,4 @@ ======================= Tools for visualization -======================= \ No newline at end of file +======================= diff --git a/examples/plot_cfi.py b/examples/plot_cfi.py index 33a54a7e9..5d8b6bafd 100644 --- a/examples/plot_cfi.py +++ b/examples/plot_cfi.py @@ -72,12 +72,14 @@ loss=log_loss, method="predict_proba", imputation_model_continuous=RidgeCV(), + features_groups={ + feat_name: [i] for i, feat_name in enumerate(load_wine().feature_names) + }, random_state=0, ) cfi.fit( X_train, y_train, - groups={feat_name: [i] for i, feat_name in enumerate(load_wine().feature_names)}, ) importances = cfi.importance(X_test, y_test) diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index 3d33fc2e5..b1d87ff86 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -69,7 +69,7 @@ def run_one_fold( train_index, test_index, vim_name="CFI", - groups=None, + features_groups=None, ): model_c = clone(model) model_c.fit(X[train_index], y[train_index]) @@ -94,6 +94,7 @@ def run_one_fold( random_state=2, method=method, loss=loss, + features_groups=features_groups, ) elif vim_name == "PFI": vim = PFI( @@ -102,14 +103,15 @@ def run_one_fold( random_state=3, method=method, loss=loss, + features_groups=features_groups, ) - vim.fit(X[train_index], y[train_index], groups=groups) + vim.fit(X[train_index], y[train_index]) importance = vim.importance(X[test_index], y[test_index])["importance"] return pd.DataFrame( { - "feature": groups.keys(), + "feature": features_groups.keys(), "importance": importance, "vim": vim_name, "model": model_name, @@ -140,10 +142,16 @@ def run_one_fold( ), ] cv = KFold(n_splits=5, shuffle=True, random_state=6) -groups = {ft: [i] for i, ft in 
enumerate(dataset.feature_names)} +features_groups = {ft: [i] for i, ft in enumerate(dataset.feature_names)} out_list = Parallel(n_jobs=5)( delayed(run_one_fold)( - X, y, model, train_index, test_index, vim_name=vim_name, groups=groups + X, + y, + model, + train_index, + test_index, + vim_name=vim_name, + features_groups=features_groups, ) for train_index, test_index in cv.split(X) for model in models @@ -279,16 +287,22 @@ def plot_results(df_importance, df_pval): # mitigate this issue, we can group correlated features together and measure the # importance of these feature groups. For instance, we can group 'sepal width' with # 'sepal length' and 'petal length' with 'petal width' and the spurious feature. -groups = {"sepal features": [0, 1], "petal features": [2, 3, 4]} +features_groups = {"sepal features": [0, 1], "petal features": [2, 3, 4]} out_list = Parallel(n_jobs=5)( delayed(run_one_fold)( - X, y, model, train_index, test_index, vim_name=vim_name, groups=groups + X, + y, + model, + train_index, + test_index, + vim_name=vim_name, + features_groups=features_groups, ) for train_index, test_index in cv.split(X) for model in models for vim_name in ["CFI", "PFI"] ) -df_grouped = pd.concat(out_list) -df_pval = compute_pval(df_grouped, threshold=threshold) -plot_results(df_grouped, df_pval) +df_features_grouped = pd.concat(out_list) +df_pval = compute_pval(df_features_grouped, threshold=threshold) +plot_results(df_features_grouped, df_pval) diff --git a/pyproject.toml b/pyproject.toml index aced01826..b9b9241f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ style = ["black >= 24.4.2", "codespell >=2.4.0", "isort >= 5.13.2"] test = [ "coverage >= 6.0, < 8", "iniconfig >= 0.1, < 3", - "matplotlib >= 3.1.0, < 4", + "matplotlib >= 3.4.0, < 4", "packaging >= 14.0, < 100", "pytest >= 8.0, < 9", "pytest-cov >= 5.0, < 8", diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index b025659e2..11d3d7aac 100644 --- 
a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -1,24 +1,23 @@ -import numbers -import warnings - import numpy as np -import pandas as pd from joblib import Parallel, delayed from sklearn.base import check_is_fitted from sklearn.metrics import root_mean_squared_error -from hidimstat._utils.exception import InternalError from hidimstat._utils.utils import _check_vim_predict_method, check_random_state -from hidimstat.base_variable_importance import BaseVariableImportance +from hidimstat.base_variable_importance import ( + BaseVariableImportance, + GroupVariableImportanceMixin, +) -class BasePerturbation(BaseVariableImportance): +class BasePerturbation(BaseVariableImportance, GroupVariableImportanceMixin): def __init__( self, estimator, loss: callable = root_mean_squared_error, n_permutations: int = 50, method: str = "predict", + features_groups=None, n_jobs: int = 1, random_state=None, ): @@ -42,6 +41,10 @@ def __init__( The method used for making predictions. This determines the predictions passed to the loss function. Supported methods are "predict", "predict_proba", "decision_function", "transform". + features_groups: dict or None, default=None + A dictionary where the keys are the group names and the values are the + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. n_jobs : int, default=1 The number of parallel jobs to run. Parallelization is done over the variables or groups of variables. @@ -57,46 +60,40 @@ def __init__( self.method = method self.n_jobs = n_jobs self.n_permutations = n_permutations - self.n_groups = None + GroupVariableImportanceMixin.__init__(self, features_groups=features_groups) self.random_state = random_state - def fit(self, X, y=None, groups=None): - """Base fit method for perturbation-based methods. Identifies the groups. + def fit(self, X, y=None): + """ + Initialize feature groups based on input data. 
Parameters ---------- - X: array-like of shape (n_samples, n_features) - The input samples. - y: array-like of shape (n_samples,) - Not used, only present for consistency with the sklearn API. - groups: dict, optional - A dictionary where the keys are the group names and the values are the - list of column names corresponding to each group. If None, the groups are - identified based on the columns of X. + X : array-like of shape (n_samples, n_features) + The training input samples. + y : array-like, optional + The target values. Not used, present for API consistency. + Defaults to None. + + Returns + ------- + self : object + Returns the instance itself to enable method chaining. + + See Also + -------- + hidimstat.base_variable_importance.GroupVariableImportanceMixin.fit : Parent class fit method that performs the actual initialization. """ - if groups is None: - self.n_groups = X.shape[1] - self.groups = {j: [j] for j in range(self.n_groups)} - self._groups_ids = np.array(list(self.groups.values()), dtype=int) - elif isinstance(groups, dict): - self.n_groups = len(groups) - self.groups = groups - if isinstance(X, pd.DataFrame): - self._groups_ids = [] - for group_key in self.groups.keys(): - self._groups_ids.append( - [ - i - for i, col in enumerate(X.columns) - if col in self.groups[group_key] - ] - ) - else: - self._groups_ids = [ - np.array(ids, dtype=int) for ids in list(self.groups.values()) - ] - else: - raise ValueError("groups needs to be a dictionary") + GroupVariableImportanceMixin.fit(self, X, y) + return self + + def _check_fit(self): + """Check if the instance has been fitted.""" + GroupVariableImportanceMixin._check_fit(self) + + def _check_compatibility(self, X): + """Check compatibility between input data and fitted model.""" + GroupVariableImportanceMixin._check_compatibility(self, X) def predict(self, X): """ @@ -113,17 +110,18 @@ def predict(self, X): out: array-like of shape (n_groups, n_permutations, n_samples) The predictions after 
perturbation of the data for each group of variables. """ - self._check_fit(X) + self._check_fit() + self._check_compatibility(X) X_ = np.asarray(X) rng = check_random_state(self.random_state) # Parallelize the computation of the importance scores for each group out_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_predict_one_group)( - X_, group_id, group_key, random_state=child_state + delayed(self._joblib_predict_one_features_group)( + X_, features_group_id, features_group_key, random_state=child_state ) - for group_id, (group_key, child_state) in enumerate( - zip(self.groups.keys(), rng.spawn(self.n_groups)) + for features_group_id, (features_group_key, child_state) in enumerate( + zip(self.features_groups.keys(), rng.spawn(self.n_features_groups_)) ) ) return np.stack(out_list, axis=0) @@ -148,7 +146,8 @@ def importance(self, X, y): for each group. - 'importance': the importance scores for each group. """ - self._check_fit(X) + GroupVariableImportanceMixin._check_fit(self) + GroupVariableImportanceMixin._check_compatibility(self, X) out_dict = dict() @@ -167,81 +166,15 @@ def importance(self, X, y): out_dict["importance"] = np.array( [ np.mean(out_dict["loss"][j]) - loss_reference - for j in range(self.n_groups) + for j in range(self.n_features_groups_) ] ) self.importances_ = out_dict["importance"] return out_dict - def _check_fit(self, X): - """ - Check if the perturbation method has been properly fitted. - - This method verifies that the perturbation method has been fitted by checking - if required attributes are set and if the number of features matches - the grouped variables. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data to validate against the fitted model. - - Raises - ------ - ValueError - If the method has not been fitted (i.e., if n_groups, groups, - or _groups_ids attributes are missing). 
- AssertionError - If the number of features in X does not match the total number - of features in the grouped variables. - """ - if ( - self.n_groups is None - or not hasattr(self, "groups") - or not hasattr(self, "_groups_ids") - ): - raise ValueError( - "The class is not fitted. The fit method must be called" - " to set variable groups. If no grouping is needed," - " call fit with groups=None" - ) - if isinstance(X, pd.DataFrame): - names = list(X.columns) - elif isinstance(X, np.ndarray) and X.dtype.names is not None: - names = X.dtype.names - # transform Structured Array in pandas array for a better manipulation - X = pd.DataFrame(X) - elif isinstance(X, np.ndarray): - names = None - else: - raise ValueError("X should be a pandas dataframe or a numpy array.") - number_columns = X.shape[1] - for index_variables in self.groups.values(): - if isinstance(index_variables[0], numbers.Integral): - assert np.all( - np.array(index_variables, dtype=int) < number_columns - ), "X does not correspond to the fitting data." - elif type(index_variables[0]) is str or np.issubdtype( - type(index_variables[0]), str - ): - assert np.all( - [name in names for name in index_variables] - ), f"The array is missing at least one of the following columns {index_variables}." - else: - raise InternalError( - "A problem with indexing has happened during the fit." 
- ) - number_unique_feature_in_groups = np.unique( - np.concatenate([values for values in self.groups.values()]) - ).shape[0] - if X.shape[1] != number_unique_feature_in_groups: - warnings.warn( - f"The number of features in X: {X.shape[1]} differs from the" - " number of features for which importance is computed: " - f"{number_unique_feature_in_groups}" - ) - - def _joblib_predict_one_group(self, X, group_id, group_key, random_state=None): + def _joblib_predict_one_features_group( + self, X, features_group_id, features_group_key, random_state=None + ): """ Compute the predictions after perturbation of the data for a given group of variables. This function is parallelized. @@ -250,21 +183,21 @@ def _joblib_predict_one_group(self, X, group_id, group_key, random_state=None): ---------- X: array-like of shape (n_samples, n_features) The input samples. - group_id: int + features_group_id: int The index of the group of variables. - group_key: str, int + features_group_key: str, int The key of the group of variables. (parameter use for debugging) random_state: The random state to use for sampling. 
""" - group_ids = self._groups_ids[group_id] - non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) + features_group_ids = self._features_groups_ids[features_group_id] + non_features_group_ids = np.delete(np.arange(X.shape[1]), features_group_ids) # Create an array X_perm_j of shape (n_permutations, n_samples, n_features) # where the j-th group of covariates is permuted X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1])) - X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1) - X_perm[:, :, group_ids] = self._permutation( - X, group_id=group_id, random_state=random_state + X_perm[:, :, non_features_group_ids] = np.delete(X, features_group_ids, axis=1) + X_perm[:, :, features_group_ids] = self._permutation( + X, features_group_id=features_group_id, random_state=random_state ) # Reshape X_perm to allow for batch prediction X_perm_batch = X_perm.reshape(-1, X.shape[1]) @@ -279,6 +212,6 @@ def _joblib_predict_one_group(self, X, group_id, group_key, random_state=None): ) return y_pred_perm - def _permutation(self, X, group_id, random_state=None): + def _permutation(self, X, features_group_id, random_state=None): """Method for creating the permuted data for the j-th group of covariates.""" raise NotImplementedError diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index c971663af..3e8adf338 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -1,9 +1,12 @@ +import numbers import warnings import numpy as np import pandas as pd from sklearn.base import BaseEstimator +from hidimstat._utils.exception import InternalError + class BaseVariableImportance(BaseEstimator): """ @@ -137,6 +140,7 @@ def plot_importance( self, ax=None, ascending=False, + feature_names=None, **seaborn_barplot_kwargs, ): """ @@ -167,7 +171,18 @@ def plot_importance( if ax is None: _, ax = plt.subplots() - feature_names = list(self.groups.keys()) + + if feature_names is None: 
+            if hasattr(self, "features_groups"):
+                feature_names = list(self.features_groups.keys())
+            else:
+                feature_names = [str(j) for j in range(self.importances_.shape[-1])]
+        elif isinstance(feature_names, list):
+            assert all(
+                isinstance(name, str) for name in feature_names
+            ), "The feature_names should be a list of the string"
+        else:
+            raise ValueError("feature_names should be a list")
 
         if self.importances_.ndim == 2:
             df_plot = {
@@ -197,3 +212,155 @@ def plot_importance(
         sns.despine(ax=ax)
         ax.set_ylabel("")
         return ax
+
+
+class GroupVariableImportanceMixin:
+    """
+    Mixin class for adding group functionality to variable importance methods.
+    This class provides functionality for handling grouped features in variable
+    importance calculations, enabling group-wise selection and importance evaluation.
+
+    Parameters
+    ----------
+    features_groups: dict or None, default=None
+        Dictionary mapping group names to lists of feature column names/indices.
+        If None, each feature is treated as its own group.
+
+    Attributes
+    ----------
+    n_features_groups_ : int
+        Number of feature groups.
+    _features_groups_ids : array-like
+        Feature indices of each group, in the iteration order of features_groups.
+
+    Methods
+    -------
+    fit(X, y=None)
+        Identifies feature groups and validates input data structure.
+    _check_fit()
+        Verifies if the instance has been fitted.
+    _check_compatibility(X)
+        Validates compatibility between input data and fitted groups.
+    """
+
+    def __init__(self, features_groups=None):
+        super().__init__()
+        self.features_groups = features_groups
+        self.n_features_groups_ = None
+        self._features_groups_ids = None
+
+    def fit(self, X, y=None):
+        """
+        Identify the feature groups and the column indices of each group.
+
+        Parameters
+        ----------
+        X: array-like of shape (n_samples, n_features)
+            The input samples.
+        y: array-like of shape (n_samples,)
+            Not used, only present for consistency with the sklearn API.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        if self.features_groups is None:
+            self.n_features_groups_ = X.shape[1]
+            self.features_groups = {j: [j] for j in range(self.n_features_groups_)}
+            self._features_groups_ids = np.array(
+                list(self.features_groups.values()), dtype=int
+            )
+        elif isinstance(self.features_groups, dict):
+            self.n_features_groups_ = len(self.features_groups)
+            # groups keep the dict insertion order: index i <-> i-th key
+            if isinstance(X, pd.DataFrame):
+                self._features_groups_ids = []
+                for features_group_key in self.features_groups.keys():
+                    self._features_groups_ids.append(
+                        [
+                            i
+                            for i, col in enumerate(X.columns)
+                            if col in self.features_groups[features_group_key]
+                        ]
+                    )
+            else:
+                self._features_groups_ids = [
+                    np.array(ids, dtype=int)
+                    for ids in list(self.features_groups.values())
+                ]
+        else:
+            raise ValueError("features_groups needs to be a dictionary")
+        return self
+
+    def _check_fit(self):
+        """
+        Check if the instance has been fitted.
+
+        Raises
+        ------
+        ValueError
+            If the class has not been fitted (i.e., if n_features_groups_
+            or _features_groups_ids is still None).
+        """
+        if self.n_features_groups_ is None or self._features_groups_ids is None:
+            raise ValueError("The class is not fitted.")
+
+    def _check_compatibility(self, X):
+        """
+        Check compatibility between input data and fitted model.
+
+        Verifies that the input data X matches the structure expected by the fitted
+        model, including feature names and dimensions.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data to validate. Can be pandas DataFrame or numpy array.
+
+        Raises
+        ------
+        ValueError
+            If X is not a pandas DataFrame or numpy array.
+        AssertionError
+            If feature indices are out of bounds.
+            If required feature names are missing from X.
+        InternalError
+            If a group is indexed by neither integers nor strings.
+            (A size mismatch with the grouped features only emits a warning.)
+        """
+        if isinstance(X, pd.DataFrame):
+            names = list(X.columns)
+        elif isinstance(X, np.ndarray) and X.dtype.names is not None:
+            names = X.dtype.names
+            # convert the structured array to a DataFrame for easier manipulation
+            X = pd.DataFrame(X)
+        elif isinstance(X, np.ndarray):
+            names = None
+        else:
+            raise ValueError("X should be a pandas dataframe or a numpy array.")
+        number_columns = X.shape[1]
+        for index_variables in self.features_groups.values():
+            if isinstance(index_variables[0], numbers.Integral):
+                assert np.all(
+                    np.array(index_variables, dtype=int) < number_columns
+                ), "X does not correspond to the fitting data."
+            elif type(index_variables[0]) is str or np.issubdtype(
+                type(index_variables[0]), str
+            ):
+                assert np.all(
+                    [name in names for name in index_variables]
+                ), f"The array is missing at least one of the following columns {index_variables}."
+            else:
+                raise InternalError(
+                    "A problem with indexing has happened during the fit."
+                )
+        number_unique_feature_in_groups = np.unique(
+            np.concatenate([values for values in self.features_groups.values()])
+        ).shape[0]
+        if X.shape[1] != number_unique_feature_in_groups:
+            warnings.warn(
+                f"The number of features in X: {X.shape[1]} differs from the"
+                " number of features for which importance is computed: "
+                f"{number_unique_feature_in_groups}"
+            )
diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py
index 22d723b78..8480b2012 100644
--- a/src/hidimstat/conditional_feature_importance.py
+++ b/src/hidimstat/conditional_feature_importance.py
@@ -13,12 +13,14 @@ def __init__(
         estimator,
         loss: callable = root_mean_squared_error,
         method: str = "predict",
-        n_jobs: int = 1,
         n_permutations: int = 50,
         imputation_model_continuous=None,
         imputation_model_categorical=None,
-        random_state: int = None,
+        features_groups=None,
+        feature_types="auto",
         categorical_max_cardinality: int = 10,
+        n_jobs: int = 1,
+        random_state: int = None,
     ):
         """
Conditional Feature Importance (CFI) algorithm. @@ -36,24 +38,32 @@ def __init__( The method to use for the prediction. This determines the predictions passed to the loss function. Supported methods are "predict", "predict_proba" or "decision_function". - n_jobs : int, default=1 - The number of jobs to run in parallel. Parallelization is done over the - variables or groups of variables. n_permutations : int, default=50 - The number of permutations to perform. For each variable/group of variables, + The number of permutations to perform. For each feature/group of features, the mean of the losses over the `n_permutations` is computed. imputation_model_continuous : sklearn compatible estimator, optional The model used to estimate the conditional distribution of a given - continuous variable/group of variables given the others. + continuous features/group of features given the others. imputation_model_categorical : sklearn compatible estimator, optional The model used to estimate the conditional distribution of a given - categorical variable/group of variables given the others. Binary is + categorical features/group of features given the others. Binary is considered as a special case of categorical. + categorical_max_cardinality : int, default=10 + The maximum cardinality of a feature to be considered as categorical + when the feature type is inferred (set to "auto" or not provided). + features_groups: dict or None, default=None + A dictionary where the keys are the group names and the values are the + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. + feature_types: str or list, default="auto" + The feature type. Supported types include "auto", "continuous", and + "categorical". If "auto", the type is inferred from the cardinality + of the unique values passed to the `fit` method. random_state : int, default=None The random state to use for sampling. 
- categorical_max_cardinality : int, default=10 - The maximum cardinality of a variable to be considered as categorical - when the variable type is inferred (set to "auto" or not provided). + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + features or groups of features. References ---------- @@ -65,6 +75,7 @@ def __init__( method=method, n_jobs=n_jobs, n_permutations=n_permutations, + features_groups=features_groups, random_state=random_state, ) @@ -76,12 +87,13 @@ def __init__( imputation_model_categorical.__class__, BaseEstimator ), "Categorial imputation model invalid" + self.feature_types = feature_types self._list_imputation_models = [] self.categorical_max_cardinality = categorical_max_cardinality self.imputation_model_categorical = imputation_model_categorical self.imputation_model_continuous = imputation_model_continuous - def fit(self, X, y=None, groups=None, var_type="auto"): + def fit(self, X, y=None): """Fit the imputation models. Parameters @@ -90,29 +102,27 @@ def fit(self, X, y=None, groups=None, var_type="auto"): The input samples. y: array-like of shape (n_samples,) Not used, only present for consistency with the sklearn API. - groups: dict, optional - A dictionary where the keys are the group names and the values are the - list of column names corresponding to each group. If None, the groups are - identified based on the columns of X. - var_type: str or list, default="auto" - The variable type. Supported types include "auto", "continuous", and - "categorical". If "auto", the type is inferred from the cardinality - of the unique values passed to the `fit` method. Returns ------- self : object Returns the instance itself. 
""" - super().fit(X, None, groups=groups) - - if isinstance(var_type, str): - self.var_type = [var_type for _ in range(self.n_groups)] - else: - self.var_type = var_type + super().fit(X, None) + + # check the feature type + if isinstance(self.feature_types, str): + if self.feature_types in ["auto", "continuous", "categorical"]: + self.feature_types = [ + self.feature_types for _ in range(self.n_features_groups_) + ] + else: + raise ValueError( + "feature_types support only the string 'auto', 'continuous', 'categorical'" + ) self._list_imputation_models = [ ConditionalSampler( - data_type=self.var_type[group_id], + data_type=self.feature_types[features_group_id], model_regression=( None if self.imputation_model_continuous is None @@ -125,52 +135,43 @@ def fit(self, X, y=None, groups=None, var_type="auto"): ), categorical_max_cardinality=self.categorical_max_cardinality, ) - for group_id in range(self.n_groups) + for features_group_id in range(self.n_features_groups_) ] # Parallelize the fitting of the covariate estimators X_ = np.asarray(X) self._list_imputation_models = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_fit_one_group)(estimator, X_, groups_ids) - for groups_ids, estimator in zip( - self._groups_ids, self._list_imputation_models + delayed(self._joblib_fit_one_features_group)( + imputation_model, X_, features_groups_ids + ) + for features_groups_ids, imputation_model in zip( + self._features_groups_ids, self._list_imputation_models ) ) return self - def _joblib_fit_one_group(self, estimator, X, groups_ids): - """Fit a single imputation model, for a single group of variables. This method + def _joblib_fit_one_features_group(self, estimator, X, features_groups_ids): + """Fit a single imputation model, for a single group of features. 
This method is parallelized.""" - X_j = X[:, groups_ids].copy() - X_minus_j = np.delete(X, groups_ids, axis=1) + X_j = X[:, features_groups_ids].copy() + X_minus_j = np.delete(X, features_groups_ids, axis=1) estimator.fit(X_minus_j, X_j) return estimator - def _check_fit(self, X): + def _check_fit(self): """ - Check if the perturbation method and imputation models have been properly fitted. - - This method verifies that both the base perturbation method and the imputation - models have been fitted by checking required attributes and validating the input - dimensions. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data to validate against the fitted model. + Check if base class and imputation models have been fitted. Raises ------ ValueError - If the method has not been fitted (i.e., if n_groups, groups, - or _groups_ids attributes are missing) or if imputation models - are not fitted. - AssertionError - If the number of features in X does not match the total number - of features in the grouped variables. + If the class has not been fitted (i.e., if n_features_groups_ + or _features_groups_ids attributes are missing). + If the class has not been fitted or imputation models are not fitted. + """ - super()._check_fit(X) + super()._check_fit() if len(self._list_imputation_models) == 0: raise ValueError( "The imputation models require to be fitted before being used." 
@@ -178,11 +179,11 @@ def _check_fit(self, X): for m in self._list_imputation_models: check_is_fitted(m.model) - def _permutation(self, X, group_id, random_state=None): + def _permutation(self, X, features_group_id, random_state=None): """Sample from the conditional distribution using a permutation of the residuals.""" - X_j = X[:, self._groups_ids[group_id]].copy() - X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) - return self._list_imputation_models[group_id].sample( + X_j = X[:, self._features_groups_ids[features_group_id]].copy() + X_minus_j = np.delete(X, self._features_groups_ids[features_group_id], axis=1) + return self._list_imputation_models[features_group_id].sample( X_minus_j, X_j, n_samples=self.n_permutations, random_state=random_state ) diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index a199c7928..d84d6e5a3 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -13,18 +13,19 @@ def __init__( estimator, loss: callable = root_mean_squared_error, method: str = "predict", + features_groups=None, n_jobs: int = 1, ): """ Leave-One-Covariate-Out (LOCO) as presented in :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. - The model is re-fitted for each variable/group of variables. The importance is + The model is re-fitted for each feature/group of features. The importance is then computed as the difference between the loss of the full model and the loss - of the model without the variable/group. + of the model without the feature/group. Parameters ---------- - estimator : sklearn compatible estimator, optional + estimator : sklearn compatible estimator The estimator to use for the prediction. loss : callable, default=root_mean_squared_error The loss function to use when comparing the perturbed model to the full @@ -33,9 +34,13 @@ def __init__( The method to use for the prediction. 
This determines the predictions passed to the loss function. Supported methods are "predict", "predict_proba" or "decision_function". + features_groups: dict or None, default=None + A dictionary where the keys are the group names and the values are the + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. n_jobs : int, default=1 The number of jobs to run in parallel. Parallelization is done over the - variables or groups of variables. + features or groups of features. Notes ----- @@ -52,10 +57,11 @@ def __init__( method=method, n_jobs=n_jobs, n_permutations=1, + features_groups=features_groups, ) self._list_estimators = [] - def fit(self, X, y, groups=None): + def fit(self, X, y): """Fit a model after removing each covariate/group of covariates. Parameters @@ -64,48 +70,55 @@ def fit(self, X, y, groups=None): The training input samples. y : array-like of shape (n_samples,) The target values. - groups : dict, default=None - A dictionary where the keys are the group names and the values are the - indices of the covariates in each group. Returns ------- self : object Returns the instance itself. 
""" - super().fit(X, y, groups) + super().fit(X, y) # create a list of covariate estimators for each group if not provided - self._list_estimators = [clone(self.estimator) for _ in range(self.n_groups)] + self._list_estimators = [ + clone(self.estimator) for _ in range(self.n_features_groups_) + ] # Parallelize the fitting of the covariate estimators self._list_estimators = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_fit_one_group)(estimator, X, y, key_groups) - for key_groups, estimator in zip(self.groups.keys(), self._list_estimators) + delayed(self._joblib_fit_one_features_group)( + estimator, X, y, key_features_groups + ) + for key_features_groups, estimator in zip( + self.features_groups.keys(), self._list_estimators + ) ) return self - def _joblib_fit_one_group(self, estimator, X, y, key_groups): + def _joblib_fit_one_features_group(self, estimator, X, y, key_features_group): """Fit the estimator after removing a group of covariates. Used in parallel.""" if isinstance(X, pd.DataFrame): - X_minus_j = X.drop(columns=self.groups[key_groups]) + X_minus_j = X.drop(columns=self.features_groups[key_features_group]) else: - X_minus_j = np.delete(X, self.groups[key_groups], axis=1) + X_minus_j = np.delete(X, self.features_groups[key_features_group], axis=1) estimator.fit(X_minus_j, y) return estimator - def _joblib_predict_one_group(self, X, group_id, key_groups, random_state=None): - """Predict the target variable after removing a group of covariates. + def _joblib_predict_one_features_group( + self, X, features_group_id, key_features_group, random_state=None + ): + """Predict the target feature after removing a group of covariates. 
Used in parallel.""" - X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) + X_minus_j = np.delete(X, self._features_groups_ids[features_group_id], axis=1) - y_pred_loco = getattr(self._list_estimators[group_id], self.method)(X_minus_j) + y_pred_loco = getattr(self._list_estimators[features_group_id], self.method)( + X_minus_j + ) return [y_pred_loco] - def _check_fit(self, X): + def _check_fit(self): """Check that an estimator has been fitted after removing each group of covariates.""" - super()._check_fit(X) + super()._check_fit() check_is_fitted(self.estimator) if len(self._list_estimators) == 0: raise ValueError("The estimators require to be fit before to use them") diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index ac3652a27..4a0dfc825 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -11,15 +11,16 @@ def __init__( estimator, loss: callable = root_mean_squared_error, method: str = "predict", - n_jobs: int = 1, n_permutations: int = 50, + features_groups=None, random_state: int = None, + n_jobs: int = 1, ): """ Permutation Feature Importance algorithm as presented in - :footcite:t:`breimanRandomForests2001`. For each variable/group of variables, + :footcite:t:`breimanRandomForests2001`. For each feature/group of features, the importance is computed as the difference between the loss of the initial - model and the loss of the model with the variable/group permuted. + model and the loss of the model with the feature/group permuted. The method was also used in :footcite:t:`mi2021permutation` Parameters @@ -33,14 +34,18 @@ def __init__( The method to use for the prediction. This determines the predictions passed to the loss function. Supported methods are "predict", "predict_proba" or "decision_function". - n_jobs : int, default=1 - The number of jobs to run in parallel. 
Parallelization is done over the - variables or groups of variables. n_permutations : int, default=50 - The number of permutations to perform. For each variable/group of variables, + The number of permutations to perform. For each feature/group of features, the mean of the losses over the `n_permutations` is computed. + features_groups: dict or None, default=None + A dictionary where the keys are the group names and the values are the + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. random_state : int, default=None The random state to use for sampling. + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + features or groups of features. References ---------- @@ -52,15 +57,18 @@ def __init__( method=method, n_jobs=n_jobs, n_permutations=n_permutations, + features_groups=features_groups, random_state=random_state, ) - def _permutation(self, X, group_id, random_state=None): + def _permutation(self, X, features_group_id, random_state=None): """Create the permuted data for the j-th group of covariates""" rng = check_random_state(random_state) X_perm_j = np.array( [ - rng.permutation(X[:, self._groups_ids[group_id]].copy()) + rng.permutation( + X[:, self._features_groups_ids[features_group_id]].copy() + ) for _ in range(self.n_permutations) ] ) diff --git a/test/test_base_importance.py b/test/test_base_importance.py new file mode 100644 index 000000000..f50487fbb --- /dev/null +++ b/test/test_base_importance.py @@ -0,0 +1,74 @@ +import matplotlib.pyplot as plt +import numpy as np +import pytest + +from hidimstat.base_variable_importance import BaseVariableImportance + + +def test_plot_importance_axis(): + """Test argument axis of plot function""" + n_features = 10 + vi = BaseVariableImportance() + # Make the plot independent of data / randomness to test only the plotting function + vi.importances_ = np.arange(n_features) + ax_1 = 
vi.plot_importance(ax=None) + assert isinstance(ax_1, plt.Axes) + + _, ax_2 = plt.subplots() + vi.importances_ = np.random.standard_normal((3, n_features)) + ax_2_bis = vi.plot_importance(ax=ax_2) + assert isinstance(ax_2_bis, plt.Axes) + assert ax_2_bis == ax_2 + + +def test_plot_importance_ascending(): + """Test argument ascending of plot function""" + n_features = 10 + vi = BaseVariableImportance() + + # Make the plot independent of data / randomness to test only the plotting function + vi.importances_ = np.arange(n_features) + np.random.shuffle(vi.importances_) + + ax_decending = vi.plot_importance(ascending=False) + assert np.all( + ax_decending.containers[0].datavalues == np.flip(np.sort(vi.importances_)) + ) + + ax_ascending = vi.plot_importance(ascending=True) + assert np.all(ax_ascending.containers[0].datavalues == np.sort(vi.importances_)) + + +def test_plot_importance_feature_names(): + """Test argument feature of plot function""" + n_features = 10 + vi = BaseVariableImportance() + + # Make the plot independent of data / randomness to test only the plotting function + vi.importances_ = np.arange(n_features) + np.random.shuffle(vi.importances_) + + features_name = [str(j) for j in np.flip(np.argsort(vi.importances_))] + ax_none = vi.plot_importance(feature_names=None) + assert np.all( + np.array([label.get_text() for label in ax_none.get_yticklabels()]) + == features_name + ) + + features_name = ["features_" + str(j) for j in np.flip(np.sort(vi.importances_))] + ax_setup = vi.plot_importance(feature_names=features_name) + assert np.all( + np.array([label.get_text() for label in ax_setup.get_yticklabels()]) + == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) + ) + + vi.features_groups = {str(j * 2): [] for j in np.flip(np.sort(vi.importances_))} + features_name = [str(j * 2) for j in np.flip(np.sort(vi.importances_))] + ax_none_group = vi.plot_importance(feature_names=None) + assert np.all( + np.array([label.get_text() for label in 
ax_none_group.get_yticklabels()]) + == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) + ) + + with pytest.raises(ValueError, match="feature_names should be a list"): + ax_none_group = vi.plot_importance(feature_names="ttt") diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py index 024f0fff6..ea29c2566 100644 --- a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -12,4 +12,4 @@ def test_no_implemented_methods(): estimator.fit(X[:, 0], X[:, 1]) basic_class = BasePerturbation(estimator=estimator) with pytest.raises(NotImplementedError): - basic_class._permutation(X, group_id=None) + basic_class._permutation(X, features_group_id=None) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 6245a85d9..9f6d82163 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -59,15 +59,13 @@ def run_cfi(X, y, n_permutation, seed): imputation_model_continuous=LinearRegression(), n_permutations=n_permutation, method="predict", + features_groups=None, + feature_types="auto", random_state=seed, n_jobs=1, ) # fit the model using the training set - cfi.fit( - X_train, - groups=None, - var_type="auto", - ) + cfi.fit(X_train) # calculate feature importance using the test set vim = cfi.importance(X_test, y_test) importance = vim["importance"] @@ -192,14 +190,12 @@ def test_group(data_generator): imputation_model_continuous=LinearRegression(), n_permutations=20, method="predict", + features_groups=groups, + feature_types="auto", random_state=0, n_jobs=1, ) - cfi.fit( - X_train_df, - groups=groups, - var_type="continuous", - ) + cfi.fit(X_train_df) # Warning expected since column names in pandas are not considered with pytest.warns(UserWarning, match="X does not have valid feature names, but"): vim = cfi.importance(X_test_df, y_test) @@ -241,16 +237,14 @@ def test_classication(data_generator): estimator=logistic_model, 
imputation_model_continuous=LinearRegression(), n_permutations=20, - random_state=0, - n_jobs=1, method="predict_proba", loss=log_loss, + features_groups=None, + feature_types=["continuous"] * X.shape[1], + random_state=0, + n_jobs=1, ) - cfi.fit( - X_train, - groups=None, - var_type=["continuous"] * X.shape[1], - ) + cfi.fit(X_train) vim = cfi.importance(X_test, y_test_clf) importance = vim["importance"] # Check that importance scores are defined for each feature @@ -300,13 +294,24 @@ def test_fit(self, data_generator): # Test fit with auto var_type cfi.fit(X) assert len(cfi._list_imputation_models) == X.shape[1] - assert cfi.n_groups == X.shape[1] + assert cfi.n_features_groups_ == X.shape[1] - # Test fit with specified groups + def test_fit_group(self, data_generator): + """Test fitting CFI with group""" + X, y, _, _ = data_generator + fitted_model = LinearRegression().fit(X, y) + # Test with specified groups groups = {"g1": [0, 1], "g2": [2, 3, 4]} - cfi.fit(X, groups=groups) + cfi = CFI( + estimator=fitted_model, + imputation_model_continuous=LinearRegression(), + features_groups=groups, + random_state=42, + ) + cfi.fit(X) + assert len(cfi._list_imputation_models) == 2 - assert cfi.n_groups == 2 + assert cfi.n_features_groups_ == 2 def test_categorical( self, @@ -327,15 +332,15 @@ def test_categorical( y = rng.random((n_samples, 1)) fitted_model = LinearRegression().fit(X, y) + feature_types = ["continuous", "continuous", "categorical"] cfi = CFI( estimator=fitted_model, imputation_model_continuous=LinearRegression(), imputation_model_categorical=LogisticRegression(), + feature_types=feature_types, random_state=0, ) - - var_type = ["continuous", "continuous", "categorical"] - cfi.fit(X, y, var_type=var_type) + cfi.fit(X, y) importances = cfi.importance(X, y)["importance"] assert len(importances) == 3 @@ -413,11 +418,13 @@ def test_invalid_type(self, data_generator): """Test invalid type of data""" X, y, _, _ = data_generator fitted_model = 
LinearRegression().fit(X, y) - cfi = CFI(estimator=fitted_model) + cfi = CFI(estimator=fitted_model, feature_types="invalid") # Test error when passing invalid var_type - with pytest.raises(ValueError, match="type of data 'invalid' unknown."): - cfi.fit(X, var_type="invalid") + with pytest.raises( + ValueError, match="feature_types support only the string 'auto'" + ): + cfi.fit(X) def test_invalid_n_permutations(self, data_generator): """Test when invalid number of permutations is provided""" @@ -434,9 +441,11 @@ def test_not_good_type_X(self, data_generator): cfi = CFI( estimator=fitted_model, imputation_model_continuous=LinearRegression(), + features_groups=None, + feature_types="auto", method="predict", ) - cfi.fit(X, groups=None, var_type="auto") + cfi.fit(X) with pytest.raises( ValueError, match="X should be a pandas dataframe or a numpy array." @@ -451,8 +460,10 @@ def test_mismatched_features(self, data_generator): estimator=fitted_model, imputation_model_continuous=LinearRegression(), method="predict", + features_groups=None, + feature_types="auto", ) - cfi.fit(X, groups=None, var_type="auto") + cfi.fit(X) with pytest.raises( AssertionError, match="X does not correspond to the fitting data." 
@@ -463,19 +474,21 @@ def test_mismatched_features_string(self, data_generator): """Test when name of features doesn't match between fit and predict""" X, y, _, _ = data_generator X = pd.DataFrame({"col_" + str(i): X[:, i] for i in range(X.shape[1])}) - fitted_model = LinearRegression().fit(X, y) - cfi = CFI( - estimator=fitted_model, - imputation_model_continuous=LinearRegression(), - method="predict", - ) subgroups = { "group1": ["col_" + str(i) for i in range(int(X.shape[1] / 2))], "group2": [ "col_" + str(i) for i in range(int(X.shape[1] / 2), X.shape[1] - 3) ], } - cfi.fit(X, groups=subgroups, var_type="auto") + fitted_model = LinearRegression().fit(X, y) + cfi = CFI( + estimator=fitted_model, + imputation_model_continuous=LinearRegression(), + method="predict", + features_groups=subgroups, + feature_types="auto", + ) + cfi.fit(X) with pytest.raises( AssertionError, @@ -489,20 +502,22 @@ def test_internal_error(self, data_generator): """Test when name of features doesn't match between fit and predict""" X, y, _, _ = data_generator X = pd.DataFrame({"col_" + str(i): X[:, i] for i in range(X.shape[1])}) - fitted_model = LinearRegression().fit(X, y) - cfi = CFI( - estimator=fitted_model, - imputation_model_continuous=LinearRegression(), - method="predict", - ) subgroups = { "group1": ["col_" + str(i) for i in range(int(X.shape[1] / 2))], "group2": [ "col_" + str(i) for i in range(int(X.shape[1] / 2), X.shape[1] - 3) ], } - cfi.fit(X, groups=subgroups, var_type="auto") - cfi.groups["group1"] = [None for i in range(100)] + fitted_model = LinearRegression().fit(X, y) + cfi = CFI( + estimator=fitted_model, + imputation_model_continuous=LinearRegression(), + method="predict", + features_groups=subgroups, + feature_types="auto", + ) + cfi.fit(X) + cfi.features_groups["group1"] = [None for i in range(100)] X = X.to_records(index=False) X = np.array(X, dtype=X.dtype.descr) @@ -516,10 +531,15 @@ def test_invalid_var_type(self, data_generator): """Test when invalid 
variable type is provided""" X, y, _, _ = data_generator fitted_model = LinearRegression().fit(X, y) - cfi = CFI(estimator=fitted_model, method="predict") + cfi = CFI( + estimator=fitted_model, + method="predict", + features_groups=None, + feature_types=["invalid_type"] * X.shape[1], + ) with pytest.raises(ValueError, match="type of data 'invalid_type' unknown."): - cfi.fit(X, groups=None, var_type=["invalid_type"] * X.shape[1]) + cfi.fit(X) def test_incompatible_imputer(self, data_generator): """Test when incompatible imputer is provided""" @@ -545,24 +565,33 @@ def test_incompatible_imputer(self, data_generator): def test_invalid_groups_format(self, data_generator): """Test when groups are provided in invalid format""" X, y, _, _ = data_generator + invalid_groups = ["group1", "group2"] # Should be dictionary fitted_model = LinearRegression().fit(X, y) - cfi = CFI(estimator=fitted_model, method="predict") + cfi = CFI( + estimator=fitted_model, + method="predict", + features_groups=invalid_groups, + feature_types="auto", + ) - invalid_groups = ["group1", "group2"] # Should be dictionary - with pytest.raises(ValueError, match="groups needs to be a dictionary"): - cfi.fit(X, groups=invalid_groups, var_type="auto") + with pytest.raises( + ValueError, match="features_groups needs to be a dictionary" + ): + cfi.fit(X) def test_groups_warning(self, data_generator): """Test if a subgroup raise a warning""" X, y, _, _ = data_generator + subgroups = {"group1": [0, 1], "group2": [2, 3]} fitted_model = LinearRegression().fit(X, y) cfi = CFI( estimator=fitted_model, imputation_model_continuous=LinearRegression(), method="predict", + features_groups=subgroups, + feature_types="auto", ) - subgroups = {"group1": [0, 1], "group2": [2, 3]} - cfi.fit(X, y, groups=subgroups, var_type="auto") + cfi.fit(X, y) with pytest.warns( UserWarning, @@ -588,9 +617,10 @@ def test_cfi_plot(data_generator): cfi = CFI( estimator=fitted_model, imputation_model_continuous=LinearRegression(), + 
feature_types="continuous", random_state=0, ) - cfi.fit(X_train, y_train, var_type="continuous") + cfi.fit(X_train, y_train) # Make the plot independent of data / randomness to test only the plotting function cfi.importances_ = np.arange(X.shape[1]) fig, ax = plt.subplots(figsize=(6, 3)) @@ -614,9 +644,10 @@ def test_cfi_plot_2d_imp(data_generator): cfi = CFI( estimator=fitted_model, imputation_model_continuous=LinearRegression(), + feature_types="continuous", random_state=0, ) - cfi.fit(X_train, y_train, var_type="continuous") + cfi.fit(X_train, y_train) # Make the plot independent of data / randomness to test only the plotting function cfi.importances_ = np.stack( [ @@ -645,9 +676,10 @@ def test_cfi_plot_coverage(data_generator): cfi = CFI( estimator=fitted_model, imputation_model_continuous=LinearRegression(), + feature_types="continuous", random_state=0, ) - cfi.fit(X_train, y_train, var_type="continuous") + cfi.fit(X_train, y_train) # Make the plot independent of data / randomness to test only the plotting function cfi.importances_ = np.arange(X.shape[1]) _, ax = plt.subplots(figsize=(6, 3)) diff --git a/test/test_leave_one_covariate_out.py b/test/test_leave_one_covariate_out.py index 10fee5370..ea3738634 100644 --- a/test/test_leave_one_covariate_out.py +++ b/test/test_leave_one_covariate_out.py @@ -31,13 +31,13 @@ def test_loco(): loco = LOCO( estimator=regression_model, method="predict", + features_groups=None, n_jobs=1, ) loco.fit( X_train, y_train, - groups=None, ) vim = loco.importance(X_test, y_test) @@ -59,12 +59,12 @@ def test_loco(): loco = LOCO( estimator=regression_model, method="predict", + features_groups=groups, n_jobs=1, ) loco.fit( X_train_df, y_train, - groups=groups, ) # warnings because we doesn't consider the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): @@ -82,13 +82,16 @@ def test_loco(): loco_clf = LOCO( estimator=logistic_model, method="predict_proba", - n_jobs=1, + 
features_groups={ + "group_0": important_features, + "the_group_1": non_important_features, + }, loss=log_loss, + n_jobs=1, ) loco_clf.fit( X_train, y_train_clf, - groups={"group_0": important_features, "the_group_1": non_important_features}, ) vim_clf = loco_clf.importance(X_test, y_test_clf) diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index af37eb00c..0e7ee0c0d 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -30,6 +30,7 @@ def test_permutation_importance(): estimator=regression_model, n_permutations=20, method="predict", + features_groups=None, random_state=0, n_jobs=1, ) @@ -37,7 +38,6 @@ def test_permutation_importance(): pfi.fit( X_train, y_train, - groups=None, ) vim = pfi.importance(X_test, y_test) @@ -60,13 +60,13 @@ def test_permutation_importance(): estimator=regression_model, n_permutations=20, method="predict", + features_groups=groups, random_state=0, n_jobs=1, ) pfi.fit( X_train_df, y_train, - groups=groups, ) # warnings because we doesn't consider the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): @@ -85,15 +85,15 @@ def test_permutation_importance(): estimator=logistic_model, n_permutations=20, method="predict_proba", + loss=log_loss, + features_groups=None, random_state=0, n_jobs=1, - loss=log_loss, ) pfi_clf.fit( X_train, y_train_clf, - groups=None, ) vim_clf = pfi_clf.importance(X_test, y_test_clf) @@ -136,7 +136,7 @@ def test_pfi_repeatability(pfi_test_data): """ X_train, X_test, y_train, y_test, pfi_default_parameters = pfi_test_data pfi = PFI(**pfi_default_parameters, random_state=0) - pfi.fit(X_train, y_train, groups=None) + pfi.fit(X_train, y_train) vim = pfi.importance(X_test, y_test)["importance"] vim_reproducible = pfi.importance(X_test, y_test)["importance"] assert np.array_equal(vim, vim_reproducible) @@ -149,16 +149,16 @@ def 
test_pfi_randomness_with_none(pfi_test_data): """ X_train, X_test, y_train, y_test, pfi_default_parameters = pfi_test_data pfi_fixed = PFI(**pfi_default_parameters, random_state=0) - pfi_fixed.fit(X_train, y_train, groups=None) + pfi_fixed.fit(X_train, y_train) vim_fixed = pfi_fixed.importance(X_test, y_test)["importance"] pfi_new_state = PFI(**pfi_default_parameters, random_state=1) - pfi_new_state.fit(X_train, y_train, groups=None) + pfi_new_state.fit(X_train, y_train) vim_new_state = pfi_new_state.importance(X_test, y_test)["importance"] assert not np.array_equal(vim_fixed, vim_new_state) pfi_none_state = PFI(**pfi_default_parameters, random_state=None) - pfi_none_state.fit(X_train, y_train, groups=None) + pfi_none_state.fit(X_train, y_train) vim_none_state_1 = pfi_none_state.importance(X_test, y_test)["importance"] vim_none_state_2 = pfi_none_state.importance(X_test, y_test)["importance"] assert not np.array_equal(vim_none_state_1, vim_none_state_2) @@ -171,11 +171,11 @@ def test_pfi_reproducibility_with_integer(pfi_test_data): """ X_train, X_test, y_train, y_test, pfi_default_parameters = pfi_test_data pfi_1 = PFI(**pfi_default_parameters, random_state=0) - pfi_1.fit(X_train, y_train, groups=None) + pfi_1.fit(X_train, y_train) vim_1 = pfi_1.importance(X_test, y_test)["importance"] pfi_2 = PFI(**pfi_default_parameters, random_state=0) - pfi_2.fit(X_train, y_train, groups=None) + pfi_2.fit(X_train, y_train) vim_2 = pfi_2.importance(X_test, y_test)["importance"] assert np.array_equal(vim_1, vim_2) @@ -189,7 +189,7 @@ def test_pfi_reproducibility_with_rng(pfi_test_data): X_train, X_test, y_train, y_test, pfi_default_parameters = pfi_test_data rng = np.random.default_rng(0) pfi = PFI(**pfi_default_parameters, random_state=rng) - pfi.fit(X_train, y_train, groups=None) + pfi.fit(X_train, y_train) vim = pfi.importance(X_test, y_test)["importance"] vim_repeat = pfi.importance(X_test, y_test)["importance"] assert not np.array_equal(vim, vim_repeat) @@ -197,6 +197,6 @@ 
def test_pfi_reproducibility_with_rng(pfi_test_data): # Refit with same rng rng = np.random.default_rng(0) pfi_reproducibility = PFI(**pfi_default_parameters, random_state=rng) - pfi_reproducibility.fit(X_train, y_train, groups=None) + pfi_reproducibility.fit(X_train, y_train) vim_reproducibility = pfi_reproducibility.importance(X_test, y_test)["importance"] assert np.array_equal(vim, vim_reproducibility) diff --git a/tools/documentation_developer/CI_documentation.rst b/tools/documentation_developer/CI_documentation.rst index 5e021c0f0..b423e0ba0 100644 --- a/tools/documentation_developer/CI_documentation.rst +++ b/tools/documentation_developer/CI_documentation.rst @@ -47,4 +47,4 @@ to automatically trigger the test of the CI. You can also trigger the workflow using with the [HTTP POST request](https://docs.github.com/en/actions/reference/workflows-and-actions/events-that-trigger-workflows#repository_dispatch). Once this modification is merged into main, it should be important to clean ci_test.yml -for having an empty workflow. \ No newline at end of file +for having an empty workflow. diff --git a/tools/documentation_developer/building_documentation.rst b/tools/documentation_developer/building_documentation.rst index 6fd57f2f1..a94f02ef8 100644 --- a/tools/documentation_developer/building_documentation.rst +++ b/tools/documentation_developer/building_documentation.rst @@ -55,4 +55,4 @@ Debugging the documentation 2. 
Example error with sphinx gallery: If there is an error which appears when the documentation is generated and not when the example is run alone, In this case, you should use the file - `documentation/debbugger_script/run_debug_example.py` \ No newline at end of file + `documentation/debbugger_script/run_debug_example.py` diff --git a/tools/documentation_developer/how_to_contribute.rst b/tools/documentation_developer/how_to_contribute.rst index 2816ffe9b..4054befa7 100644 --- a/tools/documentation_developer/how_to_contribute.rst +++ b/tools/documentation_developer/how_to_contribute.rst @@ -3,4 +3,4 @@ How to contribute to HiDimStat? ------------------------------- -.. image:: https://www.andrewelhabr.com/posts/expo.png \ No newline at end of file +.. image:: https://www.andrewelhabr.com/posts/expo.png