From 717330b2887c994f38d7bc6d1f081148ce15537a Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 14:28:19 +0100
Subject: [PATCH 01/19] Create a folder with the estimators

---
 src/hidimstat/__init__.py                             | 2 --
 src/hidimstat/{ => estimator}/Dnn_learner.py          | 0
 src/hidimstat/{ => estimator}/Dnn_learner_single.py   | 0
 src/hidimstat/{ => estimator}/RandomForestModified.py | 0
 src/hidimstat/estimator/__init__.py                   | 5 +++++
 5 files changed, 5 insertions(+), 2 deletions(-)
 rename src/hidimstat/{ => estimator}/Dnn_learner.py (100%)
 rename src/hidimstat/{ => estimator}/Dnn_learner_single.py (100%)
 rename src/hidimstat/{ => estimator}/RandomForestModified.py (100%)
 create mode 100644 src/hidimstat/estimator/__init__.py

diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py
index f8114696a..4a8cdcb33 100644
--- a/src/hidimstat/__init__.py
+++ b/src/hidimstat/__init__.py
@@ -1,6 +1,5 @@
 from .clustered_inference import clustered_inference, hd_inference
 from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso
-from .Dnn_learner_single import DnnLearnerSingle
 from .ensemble_clustered_inference import ensemble_clustered_inference
 from .knockoff_aggregation import knockoff_aggregation
 from .knockoffs import model_x_knockoff
@@ -25,7 +24,6 @@
     "dcrt_zero",
     "desparsified_lasso",
     "desparsified_group_lasso",
-    "DnnLearnerSingle",
     "ensemble_clustered_inference",
     "group_reid",
     "hd_inference",
diff --git a/src/hidimstat/Dnn_learner.py b/src/hidimstat/estimator/Dnn_learner.py
similarity index 100%
rename from src/hidimstat/Dnn_learner.py
rename to src/hidimstat/estimator/Dnn_learner.py
diff --git a/src/hidimstat/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py
similarity index 100%
rename from src/hidimstat/Dnn_learner_single.py
rename to src/hidimstat/estimator/Dnn_learner_single.py
diff --git a/src/hidimstat/RandomForestModified.py b/src/hidimstat/estimator/RandomForestModified.py
similarity index 100%
rename from src/hidimstat/RandomForestModified.py
rename to src/hidimstat/estimator/RandomForestModified.py
diff --git a/src/hidimstat/estimator/__init__.py b/src/hidimstat/estimator/__init__.py
new file mode 100644
index 000000000..afbfacd38
--- /dev/null
+++ b/src/hidimstat/estimator/__init__.py
@@ -0,0 +1,5 @@
+from .Dnn_learner_single import DnnLearnerSingle
+
+__all__ = [
+    "DnnLearnerSingle",
+]

From 99e1092722f1d0aa64d9c54af71eb452e7e61682 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 14:29:42 +0100
Subject: [PATCH 02/19] Transfer of function from global utils to local utils

---
 hidimstat/estimator/_utils/u_Dnn_learner.py | 657 ++++++++++++++++++++
 src/hidimstat/utils.py                      | 610 +-----------------
 2 files changed, 658 insertions(+), 609 deletions(-)
 create mode 100644 hidimstat/estimator/_utils/u_Dnn_learner.py

diff --git a/hidimstat/estimator/_utils/u_Dnn_learner.py b/hidimstat/estimator/_utils/u_Dnn_learner.py
new file mode 100644
index 000000000..150e3f262
--- /dev/null
+++ b/hidimstat/estimator/_utils/u_Dnn_learner.py
@@ -0,0 +1,657 @@
+
+import numpy as np
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchmetrics import Accuracy
+from sklearn.metrics import log_loss, mean_squared_error
+from sklearn.preprocessing import StandardScaler
+
+
+
+def create_X_y(
+    X,
+    y,
+    sampling_with_repetition=True,
+    split_percentage=0.8,
+    problem_type="regression",
+    list_continuous=None,
+    random_state=None,
+):
+    """
+    Create train/valid split of input data X and target variable y
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The input samples before the splitting process.
+    y : ndarray, shape (n_samples, )
+        The output samples before the splitting process.
+    sampling_with_repetition : bool, default=True
+        Sampling with repetition the train part of the train/valid scheme under
+        the training set. The number of training samples in train is equal to
+        the number of instances in the training set.
+    split_percentage : float, default=0.8
+        The training/validation cut for the provided data.
+    problem_type : str, default='regression'
+        A classification or a regression problem.
+    list_continuous : list, default=[]
+        The list of continuous variables.
+    random_state : int, default=2023
+        Fixing the seeds of the random generator.
+
+    Returns
+    -------
+    X_train_scaled : {array-like, sparse matrix} of shape (n_train_samples, n_features)
+        The sampling_with_repetitionped training input samples with scaled continuous variables.
+    y_train_scaled : {array-like} of shape (n_train_samples, )
+        The sampling_with_repetitionped training output samples scaled if continous.
+    X_validation_scaled : {array-like, sparse matrix} of shape (n_validation_samples, n_features)
+        The validation input samples with scaled continuous variables.
+    y_validation_scaled : {array-like} of shape (n_validation_samples, )
+        The validation output samples scaled if continous.
+    X_scaled : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The original input samples with scaled continuous variables.
+    y_validation : {array-like} of shape (n_samples, )
+        The original output samples with validation indices.
+    scaler_x : Scikit-learn StandardScaler
+        The standard scaler encoder for the continuous variables of the input.
+    scaler_y : Scikit-learn StandardScaler
+        The standard scaler encoder for the output if continuous.
+    valid_ind : list
+        The list of indices of the validation set.
+    """
+    rng = np.random.RandomState(random_state)
+    scaler_x, scaler_y = StandardScaler(), StandardScaler()
+    n = X.shape[0]
+
+    if sampling_with_repetition:
+        train_ind = rng.choice(n, n, replace=True)
+    else:
+        train_ind = rng.choice(
+            n, size=int(np.floor(split_percentage * n)), replace=False
+        )
+    valid_ind = np.array([ind for ind in range(n) if ind not in train_ind])
+
+    X_train, X_validation = X[train_ind], X[valid_ind]
+    y_train, y_validation = y[train_ind], y[valid_ind]
+
+    # Scaling X and y
+    X_train_scaled = X_train.copy()
+    X_validation_scaled = X_validation.copy()
+    X_scaled = X.copy()
+
+    if len(list_continuous) > 0:
+        X_train_scaled[:, list_continuous] = scaler_x.fit_transform(
+            X_train[:, list_continuous]
+        )
+        X_validation_scaled[:, list_continuous] = scaler_x.transform(
+            X_validation[:, list_continuous]
+        )
+        X_scaled[:, list_continuous] = scaler_x.transform(X[:, list_continuous])
+    if problem_type == "regression":
+        y_train_scaled = scaler_y.fit_transform(y_train)
+        y_validation_scaled = scaler_y.transform(y_validation)
+    else:
+        y_train_scaled = y_train.copy()
+        y_validation_scaled = y_validation.copy()
+
+    return (
+        X_train_scaled,
+        y_train_scaled,
+        X_validation_scaled,
+        y_validation_scaled,
+        X_scaled,
+        y_validation,
+        scaler_x,
+        scaler_y,
+        valid_ind,
+    )
+
+
+def sigmoid(x):
+    """
+    This function applies the sigmoid function element-wise to the input array x
+    """
+    return 1 / (1 + np.exp(-x))
+
+
+def softmax(x):
+    """
+    This function applies the softmax function element-wise to the input array x
+    """
+    # Ensure numerical stability by subtracting the maximum value of x from each element of x
+    # This prevents overflow errors when exponentiating large numbers
+    x = x - np.max(x, axis=-1, keepdims=True)
+    exp_x = np.exp(x)
+    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
+
+
+def relu(x):
+    """
+    This function applies the relu function element-wise to the input array x
+    """
+    return (abs(x) + x) / 2
+
+
+def ordinal_encode(y):
+    """
+    This function encodes the ordinal variable with a special gradual encoding storing also
+    the natural order information.
+    """
+    list_y = []
+    for y_col in range(y.shape[-1]):
+        # Retrieve the unique values
+        unique_vals = np.unique(y[:, y_col])
+        # Mapping each unique value to its corresponding index
+        mapping_dict = {}
+        for i, val in enumerate(unique_vals):
+            mapping_dict[val] = i + 1
+        # create a zero-filled array for the ordinal encoding
+        y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col]))))
+        # set the appropriate indices to 1 for each ordinal value and all lower ordinal values
+        for ind_el, el in enumerate(y[:, y_col]):
+            y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1
+        list_y.append(y_ordinal[:, 1:])
+
+    return list_y
+
+
+def joblib_ensemble_dnnet(
+    X,
+    y,
+    problem_type="regression",
+    activation_outcome=None,
+    list_continuous=None,
+    list_grps=None,
+    sampling_with_repetition=False,
+    split_percentage=0.8,
+    group_stacking=False,
+    input_dimensions=None,
+    n_epoch=200,
+    batch_size=32,
+    beta1=0.9,
+    beta2=0.999,
+    lr=1e-3,
+    l1_weight=1e-2,
+    l2_weight=1e-2,
+    epsilon=1e-8,
+    random_state=None,
+):
+    """
+    This function implements the ensemble learning of the sub-DNN models
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_train_samples, n_features)
+        The input samples.
+    y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs)
+        The target values (class labels in classification, real numbers in
+        regression).
+    problem_type : str, default='regression'
+        A classification or a regression problem.
+    activation_outcome : str, default=None
+        The activation function to apply in the outcome layer, "softmax" for
+        classification and "sigmoid" for both ordinal and binary cases.
+    list_continuous : list, default=None
+        The list of continuous variables.
+    list_grps : list of lists, default=None
+        A list collecting the indices of the groups' variables
+        while applying the stacking method.
+    sampling_with_repetition : bool, default=True
+        Application of sampling_with_repetition sampling for the training set.
+    split_percentage : float, default=0.8
+        The training/validation cut for the provided data.
+    group_stacking : bool, default=False
+        Apply the stacking-based method for the provided groups.
+    input_dimensions : list, default=None
+        The cumsum of inputs after the linear sub-layers.
+    n_epoch : int, default=200
+        The number of epochs for the DNN learner(s).
+    batch_size : int, default=32
+        The number of samples per batch for training.
+    beta1 : float, default=0.9
+        The exponential decay rate for the first moment estimates.
+    beta2 : float, default=0.999
+        The exponential decay rate for the second moment estimates.
+    lr : float, default=1e-3
+        The learning rate.
+    l1_weight : float, default=1e-2
+        The L1-regularization paramter for weight decay.
+    l2_weight : float, default=0
+        The L2-regularization paramter for weight decay.
+    epsilon : float, default=1e-8
+        A small constant added to the denominator to prevent division by zero.
+    random_state : int, default=2023
+        Fixing the seeds of the random generator.
+
+    Returns
+    -------
+    current_model : list
+        The parameters of the sub-DNN model
+    scaler_x : list of Scikit-learn StandardScalers
+        The scalers for the continuous input variables.
+    scaler_y : Scikit-learn StandardScaler
+        The scaler for the continuous output variable.
+    pred_v : ndarray
+        The predictions of the sub-DNN model.
+    loss : float
+        The loss score of the sub-DNN model.
+    """
+
+    pred_v = np.empty(X.shape[0])
+    # Sampling and Train/Validate splitting
+    (
+        X_train_scaled,
+        y_train_scaled,
+        X_validation_scaled,
+        y_validation_scaled,
+        X_scaled,
+        y_validation,
+        scaler_x,
+        scaler_y,
+        valid_ind,
+    ) = create_X_y(
+        X,
+        y,
+        sampling_with_repetition=sampling_with_repetition,
+        split_percentage=split_percentage,
+        problem_type=problem_type,
+        list_continuous=list_continuous,
+        random_state=random_state,
+    )
+
+    current_model = dnn_net(
+        X_train_scaled,
+        y_train_scaled,
+        X_validation_scaled,
+        y_validation_scaled,
+        problem_type=problem_type,
+        n_epoch=n_epoch,
+        batch_size=batch_size,
+        beta1=beta1,
+        beta2=beta2,
+        lr=lr,
+        l1_weight=l1_weight,
+        l2_weight=l2_weight,
+        epsilon=epsilon,
+        list_grps=list_grps,
+        group_stacking=group_stacking,
+        input_dimensions=input_dimensions,
+        random_state=random_state,
+    )
+
+    if not group_stacking:
+        X_scaled_n = X_scaled.copy()
+    else:
+        X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1]))
+        for grp_ind in range(len(list_grps)):
+            n_layer_stacking = len(current_model[3][grp_ind]) - 1
+            curr_pred = X_scaled[:, list_grps[grp_ind]].copy()
+            for ind_w_b in range(n_layer_stacking):
+                if ind_w_b == 0:
+                    curr_pred = relu(
+                        X_scaled[:, list_grps[grp_ind]].dot(
+                            current_model[3][grp_ind][ind_w_b]
+                        )
+                        + current_model[4][grp_ind][ind_w_b]
+                    )
+                else:
+                    curr_pred = relu(
+                        curr_pred.dot(current_model[3][grp_ind][ind_w_b])
+                        + current_model[4][grp_ind][ind_w_b]
+                    )
+            X_scaled_n[
+                :,
+                list(
+                    np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1])
+                ),
+            ] = (
+                curr_pred.dot(current_model[3][grp_ind][n_layer_stacking])
+                + current_model[4][grp_ind][n_layer_stacking]
+            )
+
+    n_layer = len(current_model[0]) - 1
+    for j in range(n_layer):
+        if j == 0:
+            pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j])
+        else:
+            pred = relu(pred.dot(current_model[0][j]) + current_model[1][j])
+
+    pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer]
+
+    if problem_type not in ("classification", "binary"):
+        if problem_type != "ordinal":
+            pred_v = pred * scaler_y.scale_ + scaler_y.mean_
+        else:
+            pred_v = activation_outcome[problem_type](pred)
+        loss = np.std(y_validation) ** 2 - mean_squared_error(
+            y_validation, pred_v[valid_ind]
+        )
+    else:
+        pred_v = activation_outcome[problem_type](pred)
+        loss = log_loss(
+            y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0)
+        ) - log_loss(y_validation, pred_v[valid_ind])
+
+    return (current_model, scaler_x, scaler_y, pred_v, loss)
+
+
+def _initialize_weights(layer):
+    if isinstance(layer, nn.Linear):
+        layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2
+        layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1
+
+
+def _dataset_Loader(X, y, shuffle=False, batch_size=50):
+    if y.shape[-1] == 2:
+        y = y[:, [1]]
+    dataset = torch.utils.data.TensorDataset(
+        torch.from_numpy(X).float(), torch.from_numpy(y).float()
+    )
+
+    loader = torch.utils.data.DataLoader(
+        dataset, batch_size=batch_size, shuffle=shuffle
+    )
+    return loader
+
+
+class DNN(nn.Module):
+    """
+    Feedfoward Neural Network with 4 hidden layers
+    """
+
+    def __init__(
+        self, input_dim, group_stacking, list_grps, output_dimension, problem_type
+    ):
+        super().__init__()
+        if problem_type == "classification":
+            self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension)
+        else:
+            self.accuracy = Accuracy(task="binary")
+        self.list_grps = list_grps
+        self.group_stacking = group_stacking
+        if group_stacking:
+            self.layers_stacking = nn.ModuleList(
+                [
+                    nn.Linear(
+                        in_features=len(grp),
+                        out_features=input_dim[grp_ind + 1] - input_dim[grp_ind],
+                    )
+                    # nn.Sequential(
+                    #     nn.Linear(
+                    #         in_features=len(grp),
+                    #         # out_features=max(1, int(0.1 * len(grp))),
+                    #         out_features=input_dim[grp_ind + 1]
+                    #         - input_dim[grp_ind],
+                    #     ),
+                    #     nn.ReLU(),
+                    # nn.Linear(
+                    #     in_features=max(1, int(0.1 * len(grp))),
+                    #     out_features=input_dim[grp_ind + 1]
+                    #     - input_dim[grp_ind],
+                    # ),
+                    # nn.ReLU(),
+                    # nn.Linear(
+                    #     in_features=max(1, int(0.1 * len(grp))),
+                    #     out_features=input_dim[grp_ind + 1]
+                    #     - input_dim[grp_ind],
+                    # ),
+                    # )
+                    for grp_ind, grp in enumerate(list_grps)
+                ]
+            )
+            input_dim = input_dim[-1]
+        self.layers = nn.Sequential(
+            # hidden layers
+            nn.Linear(input_dim, 50),
+            nn.ReLU(),
+            nn.Linear(50, 40),
+            nn.ReLU(),
+            nn.Linear(40, 30),
+            nn.ReLU(),
+            nn.Linear(30, 20),
+            nn.ReLU(),
+            # output layer
+            nn.Linear(20, output_dimension),
+        )
+        self.loss = 0
+
+    def forward(self, x):
+        if self.group_stacking:
+            list_stacking = [None] * len(self.layers_stacking)
+            for ind_layer, layer in enumerate(self.layers_stacking):
+                list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]])
+            x = torch.cat(list_stacking, dim=1)
+        return self.layers(x)
+
+    def training_step(self, batch, device, problem_type):
+        X, y = batch[0].to(device), batch[1].to(device)
+        y_pred = self(X)  # Generate predictions
+        if problem_type == "regression":
+            loss = F.mse_loss(y_pred, y)
+        elif problem_type == "classification":
+            loss = F.cross_entropy(y_pred, y)  # Calculate loss
+        else:
+            loss = F.binary_cross_entropy_with_logits(y_pred, y)
+        return loss
+
+    def validation_step(self, batch, device, problem_type):
+        X, y = batch[0].to(device), batch[1].to(device)
+        y_pred = self(X)  # Generate predictions
+        if problem_type == "regression":
+            loss = F.mse_loss(y_pred, y)
+            return {
+                "val_mse": loss,
+                "batch_size": len(X),
+            }
+        else:
+            if problem_type == "classification":
+                loss = F.cross_entropy(y_pred, y)  # Calculate loss
+            else:
+                loss = F.binary_cross_entropy_with_logits(y_pred, y)
+            acc = self.accuracy(y_pred, y.int())
+            return {
+                "val_loss": loss,
+                "val_acc": acc,
+                "batch_size": len(X),
+            }
+
+    def validation_epoch_end(self, outputs, problem_type):
+        if problem_type in ("classification", "binary", "ordinal"):
+            batch_losses = []
+            batch_accs = []
+            batch_sizes = []
+            for x in outputs:
+                batch_losses.append(x["val_loss"] * x["batch_size"])
+                batch_accs.append(x["val_acc"] * x["batch_size"])
+                batch_sizes.append(x["batch_size"])
+            self.loss = torch.stack(batch_losses).sum().item() / np.sum(
+                batch_sizes
+            )  # Combine losses
+            epoch_acc = torch.stack(batch_accs).sum().item() / np.sum(
+                batch_sizes
+            )  # Combine accuracies
+            return {"val_loss": self.loss, "val_acc": epoch_acc}
+        else:
+            batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs]
+            batch_sizes = [x["batch_size"] for x in outputs]
+            self.loss = torch.stack(batch_losses).sum().item() / np.sum(
+                batch_sizes
+            )  # Combine losses
+            return {"val_mse": self.loss}
+
+    def epoch_end(self, epoch, result):
+        if len(result) == 2:
+            print(
+                "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(
+                    epoch + 1, result["val_loss"], result["val_acc"]
+                )
+            )
+        else:
+            print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"]))
+
+
+def _evaluate(model, loader, device, problem_type):
+    outputs = [model.validation_step(batch, device, problem_type) for batch in loader]
+    return model.validation_epoch_end(outputs, problem_type)
+
+
+def dnn_net(
+    X_train,
+    y_train,
+    X_validation,
+    y_validation,
+    problem_type="regression",
+    n_epoch=200,
+    batch_size=32,
+    batch_size_validation=128,
+    beta1=0.9,
+    beta2=0.999,
+    lr=1e-3,
+    l1_weight=1e-2,
+    l2_weight=1e-2,
+    epsilon=1e-8,
+    list_grps=None,
+    group_stacking=False,
+    input_dimensions=None,
+    random_state=2023,
+    verbose=0,
+):
+    """
+    This function implements the training/validation process of the sub-DNN
+    models
+
+    Parameters
+    ----------
+    X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features)
+        The training input samples.
+    y_train : {array-like} of shape (n_train_samples, )
+        The training output samples.
+    X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features)
+        The validation input samples.
+    y_validation : {array-like} of shape (n_validation_samples, )
+        The validation output samples.
+    problem_type : str, default='regression'
+        A classification or a regression problem.
+    n_epoch : int, default=200
+        The number of epochs for the DNN learner(s).
+    batch_size : int, default=32
+        The number of samples per batch for training.
+    batch_size_validation : int, default=128
+        The number of samples per batch for validation.
+    beta1 : float, default=0.9
+        The exponential decay rate for the first moment estimates.
+    beta2 : float, default=0.999
+        The exponential decay rate for the second moment estimates.
+    lr : float, default=1e-3
+        The learning rate.
+    l1_weight : float, default=1e-2
+        The L1-regularization paramter for weight decay.
+    l2_weight : float, default=0
+        The L2-regularization paramter for weight decay.
+    epsilon : float, default=1e-8
+        A small constant added to the denominator to prevent division by zero.
+    list_grps : list of lists, default=None
+        A list collecting the indices of the groups' variables
+        while applying the stacking method.
+    group_stacking : bool, default=False
+        Apply the stacking-based method for the provided groups.
+    input_dimensions : list, default=None
+        The cumsum of inputs after the linear sub-layers.
+    random_state : int, default=2023
+        Fixing the seeds of the random generator.
+    verbose : int, default=0
+        If verbose > 0, the fitted iterations will be printed.
+    """
+    # Creating DataLoaders
+    train_loader = _dataset_Loader(
+        X_train,
+        y_train,
+        shuffle=True,
+        batch_size=batch_size,
+    )
+    validate_loader = _dataset_Loader(
+        X_validation, y_validation, batch_size=batch_size_validation
+    )
+    # Set the seed for PyTorch's random number generator
+    torch.manual_seed(random_state)
+
+    # Set the seed for PyTorch's CUDA random number generator(s), if available
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(random_state)
+        torch.cuda.manual_seed_all(random_state)
+
+    # Specify whether to use GPU or CPU
+    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device("cpu")
+
+    if problem_type in ("regression", "binary"):
+        output_dimension = 1
+    else:
+        output_dimension = y_train.shape[-1]
+
+    # DNN model
+    input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1]
+    model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type)
+    model.to(device)
+    # Initializing weights/bias
+    model.apply(_initialize_weights)
+    # Adam Optimizer
+    optimizer = torch.optim.Adam(
+        model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon
+    )
+
+    best_loss = 1e100
+    for epoch in range(n_epoch):
+        # Training Phase
+        model.train()
+        for batch in train_loader:
+            optimizer.zero_grad()
+            loss = model.training_step(batch, device, problem_type)
+
+            loss.backward()
+            optimizer.step()
+            for name, param in model.named_parameters():
+                if "bias" not in name:
+                    # if name.split(".")[0] == "layers_stacking":
+                    #     param.data -= l2_weight * param.data
+                    # else:
+                    param.data -= (
+                        l1_weight * torch.sign(param.data) + l2_weight * param.data
+                    )
+        # Validation Phase
+        model.eval()
+        result = _evaluate(model, validate_loader, device, problem_type)
+        if model.loss < best_loss:
+            best_loss = model.loss
+            dict_params = copy.deepcopy(model.state_dict())
+        if verbose >= 2:
+            model.epoch_end(epoch, result)
+
+    best_weight = []
+    best_bias = []
+    best_weight_stack = [[].copy() for _ in range(len(list_grps))]
+    best_bias_stack = [[].copy() for _ in range(len(list_grps))]
+
+    for name, param in dict_params.items():
+        if name.split(".")[0] == "layers":
+            if name.split(".")[-1] == "weight":
+                best_weight.append(param.numpy().T)
+            if name.split(".")[-1] == "bias":
+                best_bias.append(param.numpy()[np.newaxis, :])
+        if name.split(".")[0] == "layers_stacking":
+            curr_ind = int(name.split(".")[1])
+            if name.split(".")[-1] == "weight":
+                best_weight_stack[curr_ind].append(param.numpy().T)
+            if name.split(".")[-1] == "bias":
+                best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :])
+
+    return [
+        best_weight,
+        best_bias,
+        best_loss,
+        best_weight_stack,
+        best_bias_stack,
+    ]
\ No newline at end of file
diff --git a/src/hidimstat/utils.py b/src/hidimstat/utils.py
index 14f4d5ef3..2c112a000 100644
--- a/src/hidimstat/utils.py
+++ b/src/hidimstat/utils.py
@@ -1,11 +1,5 @@
 import copy
 import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from sklearn.metrics import log_loss, mean_squared_error
-from sklearn.preprocessing import StandardScaler
-from torchmetrics import Accuracy
 
 
 ########################## quantile aggregation method ##########################
@@ -446,606 +440,4 @@ def _check_vim_predict_method(method):
             "The method {} is not a valid method for variable importance measure prediction".format(
                 method
             )
-        )
-
-
-def sigmoid(x):
-    """
-    This function applies the sigmoid function element-wise to the input array x
-    """
-    return 1 / (1 + np.exp(-x))
-
-
-def softmax(x):
-    """
-    This function applies the softmax function element-wise to the input array x
-    """
-    # Ensure numerical stability by subtracting the maximum value of x from each element of x
-    # This prevents overflow errors when exponentiating large numbers
-    x = x - np.max(x, axis=-1, keepdims=True)
-    exp_x = np.exp(x)
-    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
-
-
-def relu(x):
-    """
-    This function applies the relu function element-wise to the input array x
-    """
-    return (abs(x) + x) / 2
-
-
-def relu_(x):
-    """
-    This function applies the derivative of the relu function element-wise
-    to the input array x
-    """
-    return (x > 0) * 1
-
-
-def convert_predict_proba(list_probs):
-    """
-    If the classification is done using a one-hot encoded variable, the list of
-    probabilites will be a list of lists for the probabilities of each of the categories.
-    This function takes the probabilities of having each category (=1 with binary) and stack
-    them into one ndarray.
-    """
-    if len(list_probs.shape) == 3:
-        list_probs = np.array(list_probs)[..., 1].T
-    return list_probs
-
-
-def ordinal_encode(y):
-    """
-    This function encodes the ordinal variable with a special gradual encoding storing also
-    the natural order information.
-    """
-    list_y = []
-    for y_col in range(y.shape[-1]):
-        # Retrieve the unique values
-        unique_vals = np.unique(y[:, y_col])
-        # Mapping each unique value to its corresponding index
-        mapping_dict = {}
-        for i, val in enumerate(unique_vals):
-            mapping_dict[val] = i + 1
-        # create a zero-filled array for the ordinal encoding
-        y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col]))))
-        # set the appropriate indices to 1 for each ordinal value and all lower ordinal values
-        for ind_el, el in enumerate(y[:, y_col]):
-            y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1
-        list_y.append(y_ordinal[:, 1:])
-
-    return list_y
-
-
-def sample_predictions(predictions, random_state=None):
-    """
-    This function samples from the same leaf node of the input sample
-    in both the regression and the classification cases
-    """
-    rng = np.random.RandomState(random_state)
-    # print(predictions[..., rng.randint(predictions.shape[2]), :])
-    # print(predictions.shape)
-    # exit(0)
-    return predictions[..., rng.randint(predictions.shape[2]), :]
-
-
-def joblib_ensemble_dnnet(
-    X,
-    y,
-    problem_type="regression",
-    activation_outcome=None,
-    list_continuous=None,
-    list_grps=None,
-    sampling_with_repetition=False,
-    split_percentage=0.8,
-    group_stacking=False,
-    input_dimensions=None,
-    n_epoch=200,
-    batch_size=32,
-    beta1=0.9,
-    beta2=0.999,
-    lr=1e-3,
-    l1_weight=1e-2,
-    l2_weight=1e-2,
-    epsilon=1e-8,
-    random_state=None,
-):
-    """
-    This function implements the ensemble learning of the sub-DNN models
-
-    Parameters
-    ----------
-    X : {array-like, sparse matrix} of shape (n_train_samples, n_features)
-        The input samples.
-    y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs)
-        The target values (class labels in classification, real numbers in
-        regression).
-    problem_type : str, default='regression'
-        A classification or a regression problem.
-    activation_outcome : str, default=None
-        The activation function to apply in the outcome layer, "softmax" for
-        classification and "sigmoid" for both ordinal and binary cases.
-    list_continuous : list, default=None
-        The list of continuous variables.
-    list_grps : list of lists, default=None
-        A list collecting the indices of the groups' variables
-        while applying the stacking method.
-    sampling_with_repetition : bool, default=True
-        Application of sampling_with_repetition sampling for the training set.
-    split_percentage : float, default=0.8
-        The training/validation cut for the provided data.
-    group_stacking : bool, default=False
-        Apply the stacking-based method for the provided groups.
-    input_dimensions : list, default=None
-        The cumsum of inputs after the linear sub-layers.
-    n_epoch : int, default=200
-        The number of epochs for the DNN learner(s).
-    batch_size : int, default=32
-        The number of samples per batch for training.
-    beta1 : float, default=0.9
-        The exponential decay rate for the first moment estimates.
-    beta2 : float, default=0.999
-        The exponential decay rate for the second moment estimates.
-    lr : float, default=1e-3
-        The learning rate.
-    l1_weight : float, default=1e-2
-        The L1-regularization paramter for weight decay.
-    l2_weight : float, default=0
-        The L2-regularization paramter for weight decay.
-    epsilon : float, default=1e-8
-        A small constant added to the denominator to prevent division by zero.
-    random_state : int, default=2023
-        Fixing the seeds of the random generator.
-
-    Returns
-    -------
-    current_model : list
-        The parameters of the sub-DNN model
-    scaler_x : list of Scikit-learn StandardScalers
-        The scalers for the continuous input variables.
-    scaler_y : Scikit-learn StandardScaler
-        The scaler for the continuous output variable.
-    pred_v : ndarray
-        The predictions of the sub-DNN model.
-    loss : float
-        The loss score of the sub-DNN model.
-    """
-
-    pred_v = np.empty(X.shape[0])
-    # Sampling and Train/Validate splitting
-    (
-        X_train_scaled,
-        y_train_scaled,
-        X_validation_scaled,
-        y_validation_scaled,
-        X_scaled,
-        y_validation,
-        scaler_x,
-        scaler_y,
-        valid_ind,
-    ) = create_X_y(
-        X,
-        y,
-        sampling_with_repetition=sampling_with_repetition,
-        split_percentage=split_percentage,
-        problem_type=problem_type,
-        list_continuous=list_continuous,
-        random_state=random_state,
-    )
-
-    current_model = dnn_net(
-        X_train_scaled,
-        y_train_scaled,
-        X_validation_scaled,
-        y_validation_scaled,
-        problem_type=problem_type,
-        n_epoch=n_epoch,
-        batch_size=batch_size,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        l1_weight=l1_weight,
-        l2_weight=l2_weight,
-        epsilon=epsilon,
-        list_grps=list_grps,
-        group_stacking=group_stacking,
-        input_dimensions=input_dimensions,
-        random_state=random_state,
-    )
-
-    if not group_stacking:
-        X_scaled_n = X_scaled.copy()
-    else:
-        X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1]))
-        for grp_ind in range(len(list_grps)):
-            n_layer_stacking = len(current_model[3][grp_ind]) - 1
-            curr_pred = X_scaled[:, list_grps[grp_ind]].copy()
-            for ind_w_b in range(n_layer_stacking):
-                if ind_w_b == 0:
-                    curr_pred = relu(
-                        X_scaled[:, list_grps[grp_ind]].dot(
-                            current_model[3][grp_ind][ind_w_b]
-                        )
-                        + current_model[4][grp_ind][ind_w_b]
-                    )
-                else:
-                    curr_pred = relu(
-                        curr_pred.dot(current_model[3][grp_ind][ind_w_b])
-                        + current_model[4][grp_ind][ind_w_b]
-                    )
-            X_scaled_n[
-                :,
-                list(
-                    np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1])
-                ),
-            ] = (
-                curr_pred.dot(current_model[3][grp_ind][n_layer_stacking])
-                + current_model[4][grp_ind][n_layer_stacking]
-            )
-
-    n_layer = len(current_model[0]) - 1
-    for j in range(n_layer):
-        if j == 0:
-            pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j])
-        else:
-            pred = relu(pred.dot(current_model[0][j]) + current_model[1][j])
-
-    pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer]
-
-    if problem_type not in ("classification", "binary"):
-        if problem_type != "ordinal":
-            pred_v = pred * scaler_y.scale_ + scaler_y.mean_
-        else:
-            pred_v = activation_outcome[problem_type](pred)
-        loss = np.std(y_validation) ** 2 - mean_squared_error(
-            y_validation, pred_v[valid_ind]
-        )
-    else:
-        pred_v = activation_outcome[problem_type](pred)
-        loss = log_loss(
-            y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0)
-        ) - log_loss(y_validation, pred_v[valid_ind])
-
-    return (current_model, scaler_x, scaler_y, pred_v, loss)
-
-
-def initialize_weights(layer):
-    if isinstance(layer, nn.Linear):
-        layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2
-        layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1
-
-
-def Dataset_Loader(X, y, shuffle=False, batch_size=50):
-    if y.shape[-1] == 2:
-        y = y[:, [1]]
-    dataset = torch.utils.data.TensorDataset(
-        torch.from_numpy(X).float(), torch.from_numpy(y).float()
-    )
-
-    loader = torch.utils.data.DataLoader(
-        dataset, batch_size=batch_size, shuffle=shuffle
-    )
-    return loader
-
-
-class DNN(nn.Module):
-    """
-    Feedfoward Neural Network with 4 hidden layers
-    """
-
-    def __init__(
-        self, input_dim, group_stacking, list_grps, output_dimension, problem_type
-    ):
-        super().__init__()
-        if problem_type == "classification":
-            self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension)
-        else:
-            self.accuracy = Accuracy(task="binary")
-        self.list_grps = list_grps
-        self.group_stacking = group_stacking
-        if group_stacking:
-            self.layers_stacking = nn.ModuleList(
-                [
-                    nn.Linear(
-                        in_features=len(grp),
-                        out_features=input_dim[grp_ind + 1] - input_dim[grp_ind],
-                    )
-                    # nn.Sequential(
-                    #     nn.Linear(
-                    #         in_features=len(grp),
-                    #         # out_features=max(1, int(0.1 * len(grp))),
-                    #         out_features=input_dim[grp_ind + 1]
-                    #         - input_dim[grp_ind],
-                    #     ),
-                    #     nn.ReLU(),
-                    # nn.Linear(
-                    #     in_features=max(1, int(0.1 * len(grp))),
-                    #     out_features=input_dim[grp_ind + 1]
-                    #     - input_dim[grp_ind],
-                    # ),
-                    # nn.ReLU(),
-                    # nn.Linear(
-                    #     in_features=max(1, int(0.1 * len(grp))),
-                    #     out_features=input_dim[grp_ind + 1]
-                    #     - input_dim[grp_ind],
-                    # ),
-                    # )
-                    for grp_ind, grp in enumerate(list_grps)
-                ]
-            )
-            input_dim = input_dim[-1]
-        self.layers = nn.Sequential(
-            # hidden layers
-            nn.Linear(input_dim, 50),
-            nn.ReLU(),
-            nn.Linear(50, 40),
-            nn.ReLU(),
-            nn.Linear(40, 30),
-            nn.ReLU(),
-            nn.Linear(30, 20),
-            nn.ReLU(),
-            # output layer
-            nn.Linear(20, output_dimension),
-        )
-        self.loss = 0
-
-    def forward(self, x):
-        if self.group_stacking:
-            list_stacking = [None] * len(self.layers_stacking)
-            for ind_layer, layer in enumerate(self.layers_stacking):
-                list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]])
-            x = torch.cat(list_stacking, dim=1)
-        return self.layers(x)
-
-    def training_step(self, batch, device, problem_type):
-        X, y = batch[0].to(device), batch[1].to(device)
-        y_pred = self(X)  # Generate predictions
-        if problem_type == "regression":
-            loss = F.mse_loss(y_pred, y)
-        elif problem_type == "classification":
-            loss = F.cross_entropy(y_pred, y)  # Calculate loss
-        else:
-            loss = F.binary_cross_entropy_with_logits(y_pred, y)
-        return loss
-
-    def validation_step(self, batch, device, problem_type):
-        X, y = batch[0].to(device), batch[1].to(device)
-        y_pred = self(X)  # Generate predictions
-        if problem_type == "regression":
-            loss = F.mse_loss(y_pred, y)
-            return {
-                "val_mse": loss,
-                "batch_size": len(X),
-            }
-        else:
-            if problem_type == "classification":
-                loss = F.cross_entropy(y_pred, y)  # Calculate loss
-            else:
-                loss = F.binary_cross_entropy_with_logits(y_pred, y)
-            acc = self.accuracy(y_pred, y.int())
-            return {
-                "val_loss": loss,
-                "val_acc": acc,
-                "batch_size": len(X),
-            }
-
-    def validation_epoch_end(self, outputs, problem_type):
-        if problem_type in ("classification", "binary"):
-            batch_losses = []
-            batch_accs = []
-            batch_sizes = []
-            for x in outputs:
-                batch_losses.append(x["val_loss"] * x["batch_size"])
-                batch_accs.append(x["val_acc"] * x["batch_size"])
-                batch_sizes.append(x["batch_size"])
-            self.loss = torch.stack(batch_losses).sum().item() / np.sum(
-                batch_sizes
-            )  # Combine losses
-            epoch_acc = torch.stack(batch_accs).sum().item() / np.sum(
-                batch_sizes
-            )  # Combine accuracies
-            return {"val_loss": self.loss, "val_acc": epoch_acc}
-        else:
-            batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs]
-            batch_sizes = [x["batch_size"] for x in outputs]
-            self.loss = torch.stack(batch_losses).sum().item() / np.sum(
-                batch_sizes
-            )  # Combine losses
-            return {"val_mse": self.loss}
-
-    def epoch_end(self, epoch, result):
-        if len(result) == 2:
-            print(
-                "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(
-                    epoch + 1, result["val_loss"], result["val_acc"]
-                )
-            )
-        else:
-            print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"]))
-
-
-def evaluate(model, loader, device, problem_type):
-    outputs = [model.validation_step(batch, device, problem_type) for batch in loader]
-    return model.validation_epoch_end(outputs, problem_type)
-
-
-def dnn_net(
-    X_train,
-    y_train,
-    X_validation,
-    y_validation,
-    problem_type="regression",
-    n_epoch=200,
-    batch_size=32,
-    batch_size_validation=128,
-    beta1=0.9,
-    beta2=0.999,
-    lr=1e-3,
-    l1_weight=1e-2,
-    l2_weight=1e-2,
-    epsilon=1e-8,
-    list_grps=None,
-    group_stacking=False,
-    input_dimensions=None,
-    random_state=2023,
-    verbose=0,
-):
-    """
-    This function implements the training/validation process of the sub-DNN
-    models
-
-    Parameters
-    ----------
-    X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features)
-        The training input samples.
-    y_train : {array-like} of shape (n_train_samples, )
-        The training output samples.
-    X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features)
-        The validation input samples.
-    y_validation : {array-like} of shape (n_validation_samples, )
-        The validation output samples.
-    problem_type : str, default='regression'
-        A classification or a regression problem.
-    n_epoch : int, default=200
-        The number of epochs for the DNN learner(s).
-    batch_size : int, default=32
-        The number of samples per batch for training.
-    batch_size_validation : int, default=128
-        The number of samples per batch for validation.
-    beta1 : float, default=0.9
-        The exponential decay rate for the first moment estimates.
-    beta2 : float, default=0.999
-        The exponential decay rate for the second moment estimates.
-    lr : float, default=1e-3
-        The learning rate.
-    l1_weight : float, default=1e-2
-        The L1-regularization paramter for weight decay.
-    l2_weight : float, default=0
-        The L2-regularization paramter for weight decay.
-    epsilon : float, default=1e-8
-        A small constant added to the denominator to prevent division by zero.
-    list_grps : list of lists, default=None
-        A list collecting the indices of the groups' variables
-        while applying the stacking method.
-    group_stacking : bool, default=False
-        Apply the stacking-based method for the provided groups.
-    input_dimensions : list, default=None
-        The cumsum of inputs after the linear sub-layers.
-    random_state : int, default=2023
-        Fixing the seeds of the random generator.
-    verbose : int, default=0
-        If verbose > 0, the fitted iterations will be printed.
-    """
-    # Creating DataLoaders
-    train_loader = Dataset_Loader(
-        X_train,
-        y_train,
-        shuffle=True,
-        batch_size=batch_size,
-    )
-    validate_loader = Dataset_Loader(
-        X_validation, y_validation, batch_size=batch_size_validation
-    )
-    # Set the seed for PyTorch's random number generator
-    torch.manual_seed(random_state)
-
-    # Set the seed for PyTorch's CUDA random number generator(s), if available
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(random_state)
-        torch.cuda.manual_seed_all(random_state)
-
-    # Specify whether to use GPU or CPU
-    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    device = torch.device("cpu")
-
-    if problem_type in ("regression", "binary"):
-        output_dimension = 1
-    else:
-        output_dimension = y_train.shape[-1]
-
-    # DNN model
-    input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1]
-    model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type)
-    model.to(device)
-    # Initializing weights/bias
-    model.apply(initialize_weights)
-    # Adam Optimizer
-    optimizer = torch.optim.Adam(
-        model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon
-    )
-
-    best_loss = 1e100
-    for epoch in range(n_epoch):
-        # Training Phase
-        model.train()
-        for batch in train_loader:
-            optimizer.zero_grad()
-            loss = model.training_step(batch, device, problem_type)
-
-            loss.backward()
-            optimizer.step()
-            for name, param in model.named_parameters():
-                if "bias" not in name:
-                    # if name.split(".")[0] == "layers_stacking":
-                    #     param.data -= l2_weight * param.data
-                    # else:
-                    param.data -= (
-                        l1_weight * torch.sign(param.data) + l2_weight * param.data
-                    )
-        # Validation Phase
-        model.eval()
-        result = evaluate(model, validate_loader, device, problem_type)
-        if model.loss < best_loss:
-            best_loss = model.loss
-            dict_params = copy.deepcopy(model.state_dict())
-        if verbose >= 2:
-            model.epoch_end(epoch, result)
-
-    best_weight = []
-    best_bias = []
-    best_weight_stack = [[].copy() for _ in range(len(list_grps))]
-    best_bias_stack = [[].copy() for _ in range(len(list_grps))]
-
-    for name, param in dict_params.items():
-        if name.split(".")[0] == "layers":
-            if name.split(".")[-1] == "weight":
-                best_weight.append(param.numpy().T)
-            if name.split(".")[-1] == "bias":
-                best_bias.append(param.numpy()[np.newaxis, :])
-        if name.split(".")[0] == "layers_stacking":
-            curr_ind = int(name.split(".")[1])
-            if name.split(".")[-1] == "weight":
-                best_weight_stack[curr_ind].append(param.numpy().T)
-            if name.split(".")[-1] == "bias":
-                best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :])
-
-    return [
-        best_weight,
-        best_bias,
-        best_loss,
-        best_weight_stack,
-        best_bias_stack,
-    ]
-
-
-def compute_imp_std(pred_scores):
-    weights = np.array([el.shape[-2] for el in pred_scores])
-    # Compute the mean of each fold over the number of observations
-    pred_mean = np.array([np.mean(el.copy(), axis=-2) for el in pred_scores])
-
-    # Weighted average
-    imp = np.average(pred_mean, axis=0, weights=weights)
-
-    # Compute the standard deviation of each fold
-    # over the number of observations
-    pred_std = np.array(
-        [
-            np.mean(
-                (el - imp[..., np.newaxis]) ** 2,
-                axis=-2,
-            )
-            for el in pred_scores
-        ]
-    )
-    std = np.sqrt(np.average(pred_std, axis=0, weights=weights) / (np.sum(weights) - 1))
-    return (imp, std)
+        )
\ No newline at end of file

From 1e546e01207591764c519cb74d4e872c169c0d55 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 14:31:50 +0100
Subject: [PATCH 03/19] Modifier Random forest for using their last function

---
 .../test/test_RandomForestModified.py         | 102 ++++++++++++++++++
 .../estimator/RandomForestModified.py         |  28 ++---
 2 files changed, 117 insertions(+), 13 deletions(-)
 create mode 100644 hidimstat/estimator/test/test_RandomForestModified.py

diff --git a/hidimstat/estimator/test/test_RandomForestModified.py b/hidimstat/estimator/test/test_RandomForestModified.py
new file mode 100644
index 000000000..e625316e3
--- /dev/null
+++ b/hidimstat/estimator/test/test_RandomForestModified.py
@@ -0,0 +1,102 @@
+from hidimstat.estimator.test._utils_test import generate_data
+from hidimstat.estimator.RandomForestModified import RandomForestClassifierModified, RandomForestRegressorModified
+import numpy as np
+
+
+def test_RandomForestRegressorModified():
+    """
+    Test the RandomForestRegressorModified for regression.
+    Parameters:
+    - regression_data: A tuple containing the input features (X) and target variable (y) for regression.
+    """
+    X, y = generate_data(problem_type="regression")
+    learner = RandomForestRegressorModified(n_jobs=10, verbose=0)
+    learner.fit(X, y)
+    predict = learner.predict(X)
+    # Check if the predicted values are close to the true values for at least one instance
+    assert np.max(np.abs(predict-y)) < 200.0
+    # Check if the predicted values are close to the true values for at least one instance
+    assert np.all(predict == y) or np.any(predict != y)
+    # Check if the predicted values are not all the same
+    assert not np.all(predict == predict[0])
+    # Check if the predicted values are not all zeros
+    assert not np.all(predict == 0)
+    # Check if the predicted values are not all ones
+    assert not np.all(predict == 1)
+    # Check if the feature importances are not all zeros
+    assert not np.all(learner.feature_importances_ == 0)
+    # Check if the feature importances are not all the same
+    assert not np.all(learner.feature_importances_ == learner.feature_importances_[0])
+    # Check if the feature importances are not all ones
+    assert not np.all(learner.feature_importances_ == 1)
+    # Check if the feature importances are not all negative
+    assert not np.all(learner.feature_importances_ < 0)
+    # # Check if the feature importances are not all positive
+    # assert not np.all(learner.feature_importances_ > 0)
+    # Check if the feature importances are not all close to zero
+    assert not np.allclose(learner.feature_importances_, 0)
+    # Check if the feature importances are not all close to one
+    assert not np.allclose(learner.feature_importances_, 1)
+
+    predictions = learner.sample_same_leaf(X)
+    #TODO: add more tests for sample_same_leaf
+
+def test_RandomForestClassifierModified():
+    """
+    Test the RandomForestClassifierModified for classification.
+    """
+    X, y = generate_data(problem_type="classification")
+    learner = RandomForestClassifierModified(n_jobs=10, verbose=0)
+    learner.fit(X, y)
+    predict_prob = learner.predict_proba(X)
+    # Check if the predicted probabilities sum up to 1 for each instance
+    assert np.allclose(np.sum(predict_prob, axis=1), 1)
+    # Check if the predicted class labels match the true labels for at least one instance
+    assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0
+    assert np.all(np.max(predict_prob, axis=1) >= 0.5)
+    assert np.all(np.min(predict_prob, axis=1) < 0.5)
+    # Check if the maximum predicted probability is greater than 0.95
+    assert 0.95 < np.max(predict_prob)
+    # Check if the minimum predicted probability is less than 0.05
+    assert 0.05 > np.min(predict_prob)
+    # Check if the predicted probabilities are not all the same
+    assert not np.all(predict_prob == predict_prob[0])
+    # Check if the predicted probabilities are not all zeros
+    assert not np.all(predict_prob == 0)
+    # Check if the predicted probabilities are not all ones
+    assert not np.all(predict_prob == 1)
+    # Check if the predicted probabilities are not all the same for each class
+    assert not np.all(predict_prob[:, 0] == predict_prob[0, 0])
+    assert not np.all(predict_prob[:, 1] == predict_prob[0, 1])
+    # Check if the predicted probabilities are not all zeros for each class
+    assert not np.all(predict_prob[:, 0] == 0)
+    assert not np.all(predict_prob[:, 1] == 0)
+    # Check if the predicted probabilities are not all ones for each class
+    assert not np.all(predict_prob[:, 0] == 1)
+
+    predict = learner.predict(X)
+    # Check if the predicted values are close to the true values for at least one instance
+    assert np.all(predict == y) or np.any(predict != y)
+    # Check if the predicted values are not all the same
+    assert not np.all(predict == predict[0])
+    # Check if the predicted values are not all zeros
+    assert not np.all(predict == 0)
+    # Check if the predicted values are not all ones
+    assert not np.all(predict == 1)
+    # Check if the feature importances are not all zeros
+    assert not np.all(learner.feature_importances_ == 0)
+    # Check if the feature importances are not all the same
+    assert not np.all(learner.feature_importances_ == learner.feature_importances_[0])
+    # Check if the feature importances are not all ones
+    assert not np.all(learner.feature_importances_ == 1)
+    # Check if the feature importances are not all negative
+    assert not np.all(learner.feature_importances_ < 0)
+    # # Check if the feature importances are not all positive
+    # assert not np.all(learner.feature_importances_ > 0)
+    # Check if the feature importances are not all close to zero
+    assert not np.allclose(learner.feature_importances_, 0)
+    # Check if the feature importances are not all close to one
+    assert not np.allclose(learner.feature_importances_, 1)
+
+    predictions = learner.sample_same_leaf(X)
+    #TODO: add more tests for sample_same_leaf
\ No newline at end of file
diff --git a/src/hidimstat/estimator/RandomForestModified.py b/src/hidimstat/estimator/RandomForestModified.py
index 7dd5b12a1..097e3d473 100644
--- a/src/hidimstat/estimator/RandomForestModified.py
+++ b/src/hidimstat/estimator/RandomForestModified.py
@@ -5,13 +5,13 @@
 class RandomForestClassifierModified(RandomForestClassifier):
     def fit(self, X, y):
         self.y_ = y
-        super().fit(X, y)
+        return super().fit(X, y)
 
     def predict(self, X):
-        super().predict(X)
+        return super().predict(X)
 
     def predict_proba(self, X):
-        super().predict_proba(X)
+        return super().predict_proba(X)
 
     def sample_same_leaf(self, X, y=None):
         if not (y is None):
@@ -42,23 +42,24 @@ def sample_same_leaf(self, X, y=None):
                 )[0]
 
                 # Append the samples to the list
-                leaf_samples.append(
-                    y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
-                )
+                if samples_in_leaf.size > 0:
+                    leaf_samples.append(
+                        y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
+                    )
 
             predictions.append(leaf_samples)
 
         # Combine the predictions from all trees to make the final prediction
-        return np.array(predictions)
+        return np.array(predictions, dtype=object)
 
 
 class RandomForestRegressorModified(RandomForestRegressor):
     def fit(self, X, y):
         self.y_ = y
-        super().fit(X, y)
+        return super().fit(X, y)
 
     def predict(self, X):
-        super().predict(X)
+        return super().predict(X)
 
     def sample_same_leaf(self, X):
         rng = np.random.RandomState(self.get_params()["random_state"])
@@ -87,11 +88,12 @@ def sample_same_leaf(self, X):
                 )[0]
 
                 # Append the samples to the list
-                leaf_samples.append(
-                    y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
-                )
+                if samples_in_leaf.size > 0:
+                    leaf_samples.append(
+                        y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
+                    )
 
             predictions.append(leaf_samples)
 
         # Combine the predictions from all trees to make the final prediction
-        return np.array(predictions)
+        return np.array(predictions, dtype=object)

From c72c6fa9e330f1ea64289ac96655f4a5da6a4c71 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 14:33:28 +0100
Subject: [PATCH 04/19] Add test for DNN learner

---
 hidimstat/estimator/test/_utils_test.py      | 54 ++++++++++++++++++
 hidimstat/estimator/test/test_Dnn_learner.py | 58 ++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 hidimstat/estimator/test/_utils_test.py
 create mode 100644 hidimstat/estimator/test/test_Dnn_learner.py

diff --git a/hidimstat/estimator/test/_utils_test.py b/hidimstat/estimator/test/_utils_test.py
new file mode 100644
index 000000000..a29e849ff
--- /dev/null
+++ b/hidimstat/estimator/test/_utils_test.py
@@ -0,0 +1,54 @@
+from sklearn.datasets import make_classification, make_regression
+import numpy as np
+import pandas as pd
+import pytest
+
+
+def generate_data(
+    n_samples=200,
+    n_features=10,
+    problem_type="regression",
+    seed=2024,
+):
+    """
+    This function generates the synthetic data used in the different tests.
+    ----------
+    n_samples : int, optional
+        Number of samples to generate, by default 200
+    n_features : int, optional
+        Number of features to generate, by default 10
+    problem_type : str, optional
+        Type of problem to generate, by default "regression" (options: "regression", "classification")
+    seed : int, optional
+        Random seed, by default 2024
+    ----------
+    Returns
+    -------
+    X : pd.DataFrame
+        Data matrix
+    y : np.array
+        Target vector
+    grps : np.array
+        Group vector
+    """
+    rng = np.random.default_rng(seed)
+    if problem_type == "regression":
+        X, y = make_regression(
+            n_samples=n_samples,
+            noise=0.2,
+            n_features=n_features,
+            random_state=seed,
+        )
+    else:
+        X, y = make_classification(
+            n_samples=n_samples,
+            n_classes=2,
+            n_informative=5,
+            n_features=n_features,
+            random_state=seed,
+        )
+        #y = np.array([str(i) for i in y])
+
+    X = pd.DataFrame(X, columns=[f"col{i+1}" for i in range(n_features)])
+
+    return X, y
\ No newline at end of file
diff --git a/hidimstat/estimator/test/test_Dnn_learner.py b/hidimstat/estimator/test/test_Dnn_learner.py
new file mode 100644
index 000000000..8f131bde5
--- /dev/null
+++ b/hidimstat/estimator/test/test_Dnn_learner.py
@@ -0,0 +1,58 @@
+from hidimstat.estimator.test._utils_test import generate_data
+from hidimstat.estimator.Dnn_learner import DnnLearner
+import numpy as np
+
+
+def test_DNN_regression():
+    """
+    Test the DNN learner for regression.
+    """
+    X, y = generate_data(problem_type="regression")
+    learner = DnnLearner(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0)
+    learner.fit(X, np.expand_dims(y, axis=1))
+    predict = learner.predict(X)[0, :, 0]
+    # Check if the predicted values are close to the true values for at least one instance
+    assert np.max(np.abs(predict-y)) < 4.0
+    # Check if the predicted values are close to the true values for at least one instance
+    assert np.all(predict == y) or np.any(predict != y)
+    # Check if the predicted values are not all the same
+    assert not np.all(predict == predict[0])
+    # Check if the predicted values are not all zeros
+    assert not np.all(predict == 0)
+    # Check if the predicted values are not all ones
+    assert not np.all(predict == 1)
+
+
+def test_DNN_classification():
+    """
+    Test the DNN learner for classification.
+    """
+    X, y = generate_data(problem_type="classification")
+    learner = DnnLearner(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0)
+    learner.fit(X, np.expand_dims(y, axis=1))
+    predict_prob = learner.predict_proba(X)
+    # Check if the predicted probabilities sum up to 1 for each instance
+    assert np.allclose(np.sum(predict_prob, axis=1), 1)
+    # Check if the predicted class labels match the true labels for at least one instance
+    assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0
+    assert np.all(np.max(predict_prob, axis=1) >= 0.5)
+    assert np.all(np.min(predict_prob, axis=1) < 0.5)
+    # Check if the maximum predicted probability is greater than 0.95
+    assert 0.95 < np.max(predict_prob)
+    # Check if the minimum predicted probability is less than 0.05
+    assert 0.05 > np.min(predict_prob)
+    # Check if the predicted probabilities are not all the same
+    assert not np.all(predict_prob == predict_prob[0])
+    # Check if the predicted probabilities are not all zeros
+    assert not np.all(predict_prob == 0)
+    # Check if the predicted probabilities are not all ones
+    assert not np.all(predict_prob == 1)
+    # Check if the predicted probabilities are not all the same for each class
+    assert not np.all(predict_prob[:, 0] == predict_prob[0, 0])
+    assert not np.all(predict_prob[:, 1] == predict_prob[0, 1])
+    # Check if the predicted probabilities are not all zeros for each class
+    assert not np.all(predict_prob[:, 0] == 0)
+    assert not np.all(predict_prob[:, 1] == 0)
+    # Check if the predicted probabilities are not all ones for each class
+    assert not np.all(predict_prob[:, 0] == 1)
+

From 5bbbc036c48936ea59871359ac468dac9961b74a Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 14:33:58 +0100
Subject: [PATCH 05/19] Add test for Dnn_learner_single and fix some bugs

---
 .../estimator/test/test_Dnn_learner_single.py | 103 ++++++++++++++++++
 src/hidimstat/estimator/Dnn_learner_single.py |  14 +--
 2 files changed, 109 insertions(+), 8 deletions(-)
 create mode 100644 hidimstat/estimator/test/test_Dnn_learner_single.py

diff --git a/hidimstat/estimator/test/test_Dnn_learner_single.py b/hidimstat/estimator/test/test_Dnn_learner_single.py
new file mode 100644
index 000000000..4f28b51de
--- /dev/null
+++ b/hidimstat/estimator/test/test_Dnn_learner_single.py
@@ -0,0 +1,103 @@
+from hidimstat.estimator.test._utils_test import generate_data
+from hidimstat.estimator.Dnn_learner_single import DnnLearnerSingle
+import numpy as np
+
+
+def test_DNN_single_regression():
+    """
+    Test the DNN learner single for regression.
+    """
+    X, y = generate_data(problem_type="regression")
+    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0)
+    learner.fit(X, y)
+    predict = learner.predict(X)[:,0]
+    # Check if the predicted values are close to the true values for at least one instance
+    assert np.max(np.abs(predict-y)) < 4.0
+    # Check if the predicted values are close to the true values for at least one instance
+    assert np.all(predict == y) or np.any(predict != y)
+    # Check if the predicted values are not all the same
+    assert not np.all(predict == predict[0])
+    # Check if the predicted values are not all zeros
+    assert not np.all(predict == 0)
+    # Check if the predicted values are not all ones
+    assert not np.all(predict == 1)
+
+
+def test_DNN_single_classification():
+    """
+    Test the DNN learner single for classification.
+    """
+    X, y = generate_data(problem_type="classification")
+    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0)
+    learner.fit(X, y)
+    predict_prob = learner.predict_proba(X)
+    # Check if the predicted probabilities sum up to 1 for each instance
+    assert np.allclose(np.sum(predict_prob, axis=1), 1)
+    # Check if the predicted class labels match the true labels for at least one instance
+    assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0
+    assert np.all(np.max(predict_prob, axis=1) >= 0.5)
+    assert np.all(np.min(predict_prob, axis=1) < 0.5)
+    # Check if the maximum predicted probability is greater than 0.95
+    assert 0.95 < np.max(predict_prob)
+    # Check if the minimum predicted probability is less than 0.05
+    assert 0.05 > np.min(predict_prob)
+    # Check if the predicted probabilities are not all the same
+    assert not np.all(predict_prob == predict_prob[0])
+    # Check if the predicted probabilities are not all zeros
+    assert not np.all(predict_prob == 0)
+    # Check if the predicted probabilities are not all ones
+    assert not np.all(predict_prob == 1)
+    # Check if the predicted probabilities are not all the same for each class
+    assert not np.all(predict_prob[:, 0] == predict_prob[0, 0])
+    assert not np.all(predict_prob[:, 1] == predict_prob[0, 1])
+    # Check if the predicted probabilities are not all zeros for each class
+    assert not np.all(predict_prob[:, 0] == 0)
+    assert not np.all(predict_prob[:, 1] == 0)
+    # Check if the predicted probabilities are not all ones for each class
+    assert not np.all(predict_prob[:, 0] == 1)
+    
+
+def test_DNN_single_binary():
+    """
+    Test the DNN learner single for binary classification.
+    """
+    X, y = generate_data(problem_type="classification")
+    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="binary", n_jobs=10, verbose=0)
+    learner.fit(X, y)
+    predict_prob = learner.predict_proba(X)
+    # Check if the predicted probabilities sum up to 1 for each instance
+    assert np.allclose(np.sum(predict_prob, axis=1), 1)
+    # Check if the predicted class labels match the true labels for at least one instance
+    assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0
+    assert np.all(np.max(predict_prob, axis=1) >= 0.5)
+    assert np.all(np.min(predict_prob, axis=1) < 0.5)
+    # Check if the maximum predicted probability is greater than 0.95
+    assert 0.95 < np.max(predict_prob)
+    # Check if the minimum predicted probability is less than 0.05
+    assert 0.05 > np.min(predict_prob)
+    # Check if the predicted probabilities are not all the same
+    assert not np.all(predict_prob == predict_prob[0])
+    # Check if the predicted probabilities are not all zeros
+    assert not np.all(predict_prob == 0)
+    # Check if the predicted probabilities are not all ones
+    assert not np.all(predict_prob == 1)
+    # Check if the predicted probabilities are not all the same for each class
+    assert not np.all(predict_prob[:, 0] == predict_prob[0, 0])
+    assert not np.all(predict_prob[:, 1] == predict_prob[0, 1])
+    # Check if the predicted probabilities are not all zeros for each class
+    assert not np.all(predict_prob[:, 0] == 0)
+    assert not np.all(predict_prob[:, 1] == 0)
+    # Check if the predicted probabilities are not all ones for each class
+    assert not np.all(predict_prob[:, 0] == 1)
+
+
+def test_DNN_single_ordinal():
+    """
+    Test the DNN learner single for ordinal.
+    """
+    X, y = generate_data(problem_type="classification")
+    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="ordinal", n_jobs=10, verbose=0)
+    learner.fit(X, y)
+    predict_prob = learner.predict_proba(X)[:,0]
+    # Check if the predicted class labels match the true labels for at least one instance
+    #assert np.sum(np.abs((np.around(predict_prob)[:,0]-y))) == 0
\ No newline at end of file
diff --git a/src/hidimstat/estimator/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py
index ddffe5bb8..7b2ee8a45 100644
--- a/src/hidimstat/estimator/Dnn_learner_single.py
+++ b/src/hidimstat/estimator/Dnn_learner_single.py
@@ -8,7 +8,7 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted
 
-from .utils import (
+from ._utils.u_Dnn_learner import (
     create_X_y,
     dnn_net,
     joblib_ensemble_dnnet,
@@ -241,6 +241,7 @@ def fit(self, X, y=None):
         loss = np.array(res_ens[4])
 
         if self.n_ensemble == 1:
+            raise Warning("The model can't be fit with n_ensemble = 1")
             return [(res_ens[0][0], (res_ens[1][0], res_ens[2][0]))]
 
         # Keeping the optimal subset of DNNs
@@ -283,6 +284,9 @@ def encode_outcome(self, y, train=True):
             y = y.reshape(-1, 1)
         if self.problem_type == "regression":
             list_y.append(y)
+        # Encoding the target with the ordinal case
+        if self.problem_type == "ordinal":
+            list_y = ordinal_encode(y)
 
         for col in range(y.shape[1]):
             if train:
@@ -291,18 +295,12 @@ def encode_outcome(self, y, train=True):
                     self.enc_y.append(OneHotEncoder(handle_unknown="ignore"))
                     curr_y = self.enc_y[col].fit_transform(y[:, [col]]).toarray()
                     list_y.append(curr_y)
-
-                # Encoding the target with the ordinal case
-                if self.problem_type == "ordinal":
-                    y = ordinal_encode(y)
-
             else:
                 # Encoding the target with the classification case
                 if self.problem_type in ("classification", "binary"):
                     curr_y = self.enc_y[col].transform(y[:, [col]]).toarray()
                     list_y.append(curr_y)
-
-                ## ToDo Add the ordinal case
+                
         return np.array(list_y)
 
     def hyper_tuning(

From 8a806f4e8a52ca19cadacb905f85dc68bc6c92e5 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 14:35:46 +0100
Subject: [PATCH 06/19] Ignore coverage files

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index a13abe82b..425d3aba2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,7 @@ joblib
 *.pyc
 __pycache__
 *.egg-info
-.coverage
+.coverage*
 
 # documenation 
 doc

From 8fc0e8220049e501404b3f2e809bf1eff6ebe621 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 14:59:44 +0100
Subject: [PATCH 07/19] Missing a file for test

---
 hidimstat/estimator/test/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 hidimstat/estimator/test/__init__.py

diff --git a/hidimstat/estimator/test/__init__.py b/hidimstat/estimator/test/__init__.py
new file mode 100644
index 000000000..e69de29bb

From d91f78ba03734e85eb7ac057a1998376beb8fb15 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 13 Dec 2024 16:25:36 +0100
Subject: [PATCH 08/19] Modified coverage configuration for new tests

---
 hidimstat/estimator/{test => tests}/__init__.py                  | 0
 hidimstat/estimator/{test => tests}/_utils_test.py               | 0
 hidimstat/estimator/{test => tests}/test_Dnn_learner.py          | 0
 hidimstat/estimator/{test => tests}/test_Dnn_learner_single.py   | 0
 hidimstat/estimator/{test => tests}/test_RandomForestModified.py | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename hidimstat/estimator/{test => tests}/__init__.py (100%)
 rename hidimstat/estimator/{test => tests}/_utils_test.py (100%)
 rename hidimstat/estimator/{test => tests}/test_Dnn_learner.py (100%)
 rename hidimstat/estimator/{test => tests}/test_Dnn_learner_single.py (100%)
 rename hidimstat/estimator/{test => tests}/test_RandomForestModified.py (100%)

diff --git a/hidimstat/estimator/test/__init__.py b/hidimstat/estimator/tests/__init__.py
similarity index 100%
rename from hidimstat/estimator/test/__init__.py
rename to hidimstat/estimator/tests/__init__.py
diff --git a/hidimstat/estimator/test/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py
similarity index 100%
rename from hidimstat/estimator/test/_utils_test.py
rename to hidimstat/estimator/tests/_utils_test.py
diff --git a/hidimstat/estimator/test/test_Dnn_learner.py b/hidimstat/estimator/tests/test_Dnn_learner.py
similarity index 100%
rename from hidimstat/estimator/test/test_Dnn_learner.py
rename to hidimstat/estimator/tests/test_Dnn_learner.py
diff --git a/hidimstat/estimator/test/test_Dnn_learner_single.py b/hidimstat/estimator/tests/test_Dnn_learner_single.py
similarity index 100%
rename from hidimstat/estimator/test/test_Dnn_learner_single.py
rename to hidimstat/estimator/tests/test_Dnn_learner_single.py
diff --git a/hidimstat/estimator/test/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py
similarity index 100%
rename from hidimstat/estimator/test/test_RandomForestModified.py
rename to hidimstat/estimator/tests/test_RandomForestModified.py

From d032aa25966930f8ac146719492b153eda7e212b Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Mon, 16 Dec 2024 10:37:35 +0100
Subject: [PATCH 09/19] Fix bug in the import of test

---
 hidimstat/estimator/tests/test_Dnn_learner.py          | 2 +-
 hidimstat/estimator/tests/test_Dnn_learner_single.py   | 2 +-
 hidimstat/estimator/tests/test_RandomForestModified.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hidimstat/estimator/tests/test_Dnn_learner.py b/hidimstat/estimator/tests/test_Dnn_learner.py
index 8f131bde5..e8dad389f 100644
--- a/hidimstat/estimator/tests/test_Dnn_learner.py
+++ b/hidimstat/estimator/tests/test_Dnn_learner.py
@@ -1,4 +1,4 @@
-from hidimstat.estimator.test._utils_test import generate_data
+from hidimstat.estimator.tests._utils_test import generate_data
 from hidimstat.estimator.Dnn_learner import DnnLearner
 import numpy as np
 
diff --git a/hidimstat/estimator/tests/test_Dnn_learner_single.py b/hidimstat/estimator/tests/test_Dnn_learner_single.py
index 4f28b51de..42037bfd2 100644
--- a/hidimstat/estimator/tests/test_Dnn_learner_single.py
+++ b/hidimstat/estimator/tests/test_Dnn_learner_single.py
@@ -1,4 +1,4 @@
-from hidimstat.estimator.test._utils_test import generate_data
+from hidimstat.estimator.tests._utils_test import generate_data
 from hidimstat.estimator.Dnn_learner_single import DnnLearnerSingle
 import numpy as np
 
diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py
index e625316e3..37bdcf8bf 100644
--- a/hidimstat/estimator/tests/test_RandomForestModified.py
+++ b/hidimstat/estimator/tests/test_RandomForestModified.py
@@ -1,4 +1,4 @@
-from hidimstat.estimator.test._utils_test import generate_data
+from hidimstat.estimator.tests._utils_test import generate_data
 from hidimstat.estimator.RandomForestModified import RandomForestClassifierModified, RandomForestRegressorModified
 import numpy as np
 

From b1638a3ca6892d1e2fac792e9af85ed877dfa84b Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Thu, 19 Dec 2024 17:27:49 +0100
Subject: [PATCH 10/19] Fix some typos

---
 hidimstat/estimator/tests/_utils_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hidimstat/estimator/tests/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py
index a29e849ff..7d50cea92 100644
--- a/hidimstat/estimator/tests/_utils_test.py
+++ b/hidimstat/estimator/tests/_utils_test.py
@@ -1,7 +1,6 @@
 from sklearn.datasets import make_classification, make_regression
 import numpy as np
 import pandas as pd
-import pytest
 
 
 def generate_data(
@@ -12,6 +11,8 @@ def generate_data(
 ):
     """
     This function generates the synthetic data used in the different tests.
+    
+    Parameters
     ----------
     n_samples : int, optional
         Number of samples to generate, by default 200
@@ -21,7 +22,7 @@ def generate_data(
         Type of problem to generate, by default "regression" (options: "regression", "classification")
     seed : int, optional
         Random seed, by default 2024
-    ----------
+
     Returns
     -------
     X : pd.DataFrame

From e0075fcfbe2cfa58536689cd0f61c9e9c2b0177c Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Thu, 19 Dec 2024 17:30:17 +0100
Subject: [PATCH 11/19] Formating files

---
 hidimstat/estimator/_utils/u_Dnn_learner.py   |  4 +--
 hidimstat/estimator/tests/_utils_test.py      |  6 ++---
 hidimstat/estimator/tests/test_Dnn_learner.py | 11 +++++---
 .../tests/test_Dnn_learner_single.py          | 26 ++++++++++++-------
 .../tests/test_RandomForestModified.py        | 12 ++++++---
 src/hidimstat/estimator/Dnn_learner_single.py |  2 +-
 6 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/hidimstat/estimator/_utils/u_Dnn_learner.py b/hidimstat/estimator/_utils/u_Dnn_learner.py
index 150e3f262..7008b0313 100644
--- a/hidimstat/estimator/_utils/u_Dnn_learner.py
+++ b/hidimstat/estimator/_utils/u_Dnn_learner.py
@@ -1,4 +1,3 @@
-
 import numpy as np
 import copy
 import torch
@@ -9,7 +8,6 @@
 from sklearn.preprocessing import StandardScaler
 
 
-
 def create_X_y(
     X,
     y,
@@ -654,4 +652,4 @@ def dnn_net(
         best_loss,
         best_weight_stack,
         best_bias_stack,
-    ]
\ No newline at end of file
+    ]
diff --git a/hidimstat/estimator/tests/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py
index 7d50cea92..bef030fde 100644
--- a/hidimstat/estimator/tests/_utils_test.py
+++ b/hidimstat/estimator/tests/_utils_test.py
@@ -11,7 +11,7 @@ def generate_data(
 ):
     """
     This function generates the synthetic data used in the different tests.
-    
+
     Parameters
     ----------
     n_samples : int, optional
@@ -48,8 +48,8 @@ def generate_data(
             n_features=n_features,
             random_state=seed,
         )
-        #y = np.array([str(i) for i in y])
+        # y = np.array([str(i) for i in y])
 
     X = pd.DataFrame(X, columns=[f"col{i+1}" for i in range(n_features)])
 
-    return X, y
\ No newline at end of file
+    return X, y
diff --git a/hidimstat/estimator/tests/test_Dnn_learner.py b/hidimstat/estimator/tests/test_Dnn_learner.py
index e8dad389f..1da5b4da5 100644
--- a/hidimstat/estimator/tests/test_Dnn_learner.py
+++ b/hidimstat/estimator/tests/test_Dnn_learner.py
@@ -8,11 +8,13 @@ def test_DNN_regression():
     Test the DNN learner for regression.
     """
     X, y = generate_data(problem_type="regression")
-    learner = DnnLearner(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0)
+    learner = DnnLearner(
+        do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0
+    )
     learner.fit(X, np.expand_dims(y, axis=1))
     predict = learner.predict(X)[0, :, 0]
     # Check if the predicted values are close to the true values for at least one instance
-    assert np.max(np.abs(predict-y)) < 4.0
+    assert np.max(np.abs(predict - y)) < 4.0
     # Check if the predicted values are close to the true values for at least one instance
     assert np.all(predict == y) or np.any(predict != y)
     # Check if the predicted values are not all the same
@@ -28,7 +30,9 @@ def test_DNN_classification():
     Test the DNN learner for classification.
     """
     X, y = generate_data(problem_type="classification")
-    learner = DnnLearner(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0)
+    learner = DnnLearner(
+        do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0
+    )
     learner.fit(X, np.expand_dims(y, axis=1))
     predict_prob = learner.predict_proba(X)
     # Check if the predicted probabilities sum up to 1 for each instance
@@ -55,4 +59,3 @@ def test_DNN_classification():
     assert not np.all(predict_prob[:, 1] == 0)
     # Check if the predicted probabilities are not all ones for each class
     assert not np.all(predict_prob[:, 0] == 1)
-
diff --git a/hidimstat/estimator/tests/test_Dnn_learner_single.py b/hidimstat/estimator/tests/test_Dnn_learner_single.py
index 42037bfd2..4c4f5fc4d 100644
--- a/hidimstat/estimator/tests/test_Dnn_learner_single.py
+++ b/hidimstat/estimator/tests/test_Dnn_learner_single.py
@@ -8,11 +8,13 @@ def test_DNN_single_regression():
     Test the DNN learner single for regression.
     """
     X, y = generate_data(problem_type="regression")
-    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0)
+    learner = DnnLearnerSingle(
+        do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0
+    )
     learner.fit(X, y)
-    predict = learner.predict(X)[:,0]
+    predict = learner.predict(X)[:, 0]
     # Check if the predicted values are close to the true values for at least one instance
-    assert np.max(np.abs(predict-y)) < 4.0
+    assert np.max(np.abs(predict - y)) < 4.0
     # Check if the predicted values are close to the true values for at least one instance
     assert np.all(predict == y) or np.any(predict != y)
     # Check if the predicted values are not all the same
@@ -28,7 +30,9 @@ def test_DNN_single_classification():
     Test the DNN learner single for classification.
     """
     X, y = generate_data(problem_type="classification")
-    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0)
+    learner = DnnLearnerSingle(
+        do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0
+    )
     learner.fit(X, y)
     predict_prob = learner.predict_proba(X)
     # Check if the predicted probabilities sum up to 1 for each instance
@@ -55,14 +59,16 @@ def test_DNN_single_classification():
     assert not np.all(predict_prob[:, 1] == 0)
     # Check if the predicted probabilities are not all ones for each class
     assert not np.all(predict_prob[:, 0] == 1)
-    
+
 
 def test_DNN_single_binary():
     """
     Test the DNN learner single for binary classification.
     """
     X, y = generate_data(problem_type="classification")
-    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="binary", n_jobs=10, verbose=0)
+    learner = DnnLearnerSingle(
+        do_hypertuning=True, problem_type="binary", n_jobs=10, verbose=0
+    )
     learner.fit(X, y)
     predict_prob = learner.predict_proba(X)
     # Check if the predicted probabilities sum up to 1 for each instance
@@ -96,8 +102,10 @@ def test_DNN_single_ordinal():
     Test the DNN learner single for ordinal.
     """
     X, y = generate_data(problem_type="classification")
-    learner = DnnLearnerSingle(do_hypertuning=True, problem_type="ordinal", n_jobs=10, verbose=0)
+    learner = DnnLearnerSingle(
+        do_hypertuning=True, problem_type="ordinal", n_jobs=10, verbose=0
+    )
     learner.fit(X, y)
-    predict_prob = learner.predict_proba(X)[:,0]
+    predict_prob = learner.predict_proba(X)[:, 0]
     # Check if the predicted class labels match the true labels for at least one instance
-    #assert np.sum(np.abs((np.around(predict_prob)[:,0]-y))) == 0
\ No newline at end of file
+    # assert np.sum(np.abs((np.around(predict_prob)[:,0]-y))) == 0
diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py
index 37bdcf8bf..a7452c706 100644
--- a/hidimstat/estimator/tests/test_RandomForestModified.py
+++ b/hidimstat/estimator/tests/test_RandomForestModified.py
@@ -1,5 +1,8 @@
 from hidimstat.estimator.tests._utils_test import generate_data
-from hidimstat.estimator.RandomForestModified import RandomForestClassifierModified, RandomForestRegressorModified
+from hidimstat.estimator.RandomForestModified import (
+    RandomForestClassifierModified,
+    RandomForestRegressorModified,
+)
 import numpy as np
 
 
@@ -14,7 +17,7 @@ def test_RandomForestRegressorModified():
     learner.fit(X, y)
     predict = learner.predict(X)
     # Check if the predicted values are close to the true values for at least one instance
-    assert np.max(np.abs(predict-y)) < 200.0
+    assert np.max(np.abs(predict - y)) < 200.0
     # Check if the predicted values are close to the true values for at least one instance
     assert np.all(predict == y) or np.any(predict != y)
     # Check if the predicted values are not all the same
@@ -39,7 +42,8 @@ def test_RandomForestRegressorModified():
     assert not np.allclose(learner.feature_importances_, 1)
 
     predictions = learner.sample_same_leaf(X)
-    #TODO: add more tests for sample_same_leaf
+    # TODO: add more tests for sample_same_leaf
+
 
 def test_RandomForestClassifierModified():
     """
@@ -99,4 +103,4 @@ def test_RandomForestClassifierModified():
     assert not np.allclose(learner.feature_importances_, 1)
 
     predictions = learner.sample_same_leaf(X)
-    #TODO: add more tests for sample_same_leaf
\ No newline at end of file
+    # TODO: add more tests for sample_same_leaf
diff --git a/src/hidimstat/estimator/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py
index 7b2ee8a45..9f487771b 100644
--- a/src/hidimstat/estimator/Dnn_learner_single.py
+++ b/src/hidimstat/estimator/Dnn_learner_single.py
@@ -300,7 +300,7 @@ def encode_outcome(self, y, train=True):
                 if self.problem_type in ("classification", "binary"):
                     curr_y = self.enc_y[col].transform(y[:, [col]]).toarray()
                     list_y.append(curr_y)
-                
+
         return np.array(list_y)
 
     def hyper_tuning(

From 6217732c6b2a5fc8825b41fc08ff27e20c190089 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Thu, 19 Dec 2024 17:31:32 +0100
Subject: [PATCH 12/19] Change name of the files

---
 hidimstat/estimator/_utils/{u_Dnn_learner.py => Dnn.py} | 0
 src/hidimstat/estimator/Dnn_learner_single.py           | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename hidimstat/estimator/_utils/{u_Dnn_learner.py => Dnn.py} (100%)

diff --git a/hidimstat/estimator/_utils/u_Dnn_learner.py b/hidimstat/estimator/_utils/Dnn.py
similarity index 100%
rename from hidimstat/estimator/_utils/u_Dnn_learner.py
rename to hidimstat/estimator/_utils/Dnn.py
diff --git a/src/hidimstat/estimator/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py
index 9f487771b..6ffea2c12 100644
--- a/src/hidimstat/estimator/Dnn_learner_single.py
+++ b/src/hidimstat/estimator/Dnn_learner_single.py
@@ -8,7 +8,7 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted
 
-from ._utils.u_Dnn_learner import (
+from ._utils.Dnn import (
     create_X_y,
     dnn_net,
     joblib_ensemble_dnnet,

From 3e2fbe00e6ac6a62df0bf7b90df08274788ae2ac Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Thu, 19 Dec 2024 18:19:50 +0100
Subject: [PATCH 13/19] formart utils

---
 hidimstat/utils.py | 174 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 hidimstat/utils.py

diff --git a/hidimstat/utils.py b/hidimstat/utils.py
new file mode 100644
index 000000000..ad8bd8741
--- /dev/null
+++ b/hidimstat/utils.py
@@ -0,0 +1,174 @@
+import numpy as np
+
+
+def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False):
+    """
+    This function implements the quantile aggregation method for p-values.
+    """
+    # if pvalues are one-dimensional, do nothing
+    if pvals.shape[0] == 1:
+        return pvals[0]
+    if adaptive:
+        return _adaptive_quantile_aggregation(pvals, gamma_min)
+    else:
+        return _fixed_quantile_aggregation(pvals, gamma)
+
+
+def fdr_threshold(pvals, fdr=0.1, method="bhq", reshaping_function=None):
+    if method == "bhq":
+        return _bhq_threshold(pvals, fdr=fdr)
+    elif method == "bhy":
+        return _bhy_threshold(pvals, fdr=fdr, reshaping_function=reshaping_function)
+    elif method == "ebh":
+        return _ebh_threshold(pvals, fdr=fdr)
+    else:
+        raise ValueError("{} is not support FDR control method".format(method))
+
+
+def cal_fdp_power(selected, non_zero_index, r_index=False):
+    """Calculate power and False Discovery Proportion
+
+    Parameters
+    ----------
+    selected: list index (in R format) of selected non-null variables
+    non_zero_index: true index of non-null variables
+    r_index : True if the index is taken from rpy2 inference
+
+    Returns
+    -------
+    fdp: False Discoveries Proportion
+    power: percentage of correctly selected variables over total number of
+        non-null variables
+
+    """
+    # selected is the index list in R and will be different from index of
+    # python by 1 unit
+
+    if selected.size == 0:
+        return 0.0, 0.0
+
+    n_positives = len(non_zero_index)
+
+    if r_index:
+        selected = selected - 1
+
+    true_positive = np.intersect1d(selected, non_zero_index)
+    false_positive = np.setdiff1d(selected, true_positive)
+
+    fdp = len(false_positive) / max(1, len(selected))
+    power = min(len(true_positive), n_positives) / n_positives
+
+    return fdp, power
+
+
+def _bhq_threshold(pvals, fdr=0.1):
+    """Standard Benjamini-Hochberg for controlling False discovery rate"""
+    n_features = len(pvals)
+    pvals_sorted = np.sort(pvals)
+    selected_index = 2 * n_features
+    for i in range(n_features - 1, -1, -1):
+        if pvals_sorted[i] <= fdr * (i + 1) / n_features:
+            selected_index = i
+            break
+    if selected_index <= n_features:
+        return pvals_sorted[selected_index]
+    else:
+        return -1.0
+
+
+def _ebh_threshold(evals, fdr=0.1):
+    """e-BH procedure for FDR control (see Wang and Ramdas 2021)"""
+    n_features = len(evals)
+    evals_sorted = -np.sort(-evals)  # sort in descending order
+    selected_index = 2 * n_features
+    for i in range(n_features - 1, -1, -1):
+        if evals_sorted[i] >= n_features / (fdr * (i + 1)):
+            selected_index = i
+            break
+    if selected_index <= n_features:
+        return evals_sorted[selected_index]
+    else:
+        return np.infty
+
+
+def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1):
+    """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input
+    shape function. Reference: Ramdas et al (2017)
+    """
+    n_features = len(pvals)
+    pvals_sorted = np.sort(pvals)
+    selected_index = 2 * n_features
+    # Default value for reshaping function -- defined in
+    # Benjamini & Yekutieli (2001)
+    if reshaping_function is None:
+        temp = np.arange(n_features)
+        sum_inverse = np.sum(1 / (temp + 1))
+        return _bhq_threshold(pvals, fdr / sum_inverse)
+    else:
+        for i in range(n_features - 1, -1, -1):
+            if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features:
+                selected_index = i
+                break
+        if selected_index <= n_features:
+            return pvals_sorted[selected_index]
+        else:
+            return -1.0
+
+
+def _fixed_quantile_aggregation(pvals, gamma=0.5):
+    """Quantile aggregation function based on Meinshausen et al (2008)
+
+    Parameters
+    ----------
+    pvals : 2D ndarray (n_sampling_with_repetition, n_test)
+        p-value (adjusted)
+
+    gamma : float
+        Percentile value used for aggregation.
+
+    Returns
+    -------
+    1D ndarray (n_tests, )
+        Vector of aggregated p-values
+    """
+    converted_score = (1 / gamma) * (np.percentile(pvals, q=100 * gamma, axis=0))
+
+    return np.minimum(1, converted_score)
+
+
+def _adaptive_quantile_aggregation(pvals, gamma_min=0.05):
+    """adaptive version of the quantile aggregation method, Meinshausen et al.
+    (2008)"""
+    gammas = np.arange(gamma_min, 1.05, 0.05)
+    list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas])
+
+    return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0))
+
+
+def _lambda_max(X, y, use_noise_estimate=True):
+    """Calculation of lambda_max, the smallest value of regularization parameter in
+    lasso program for non-zero coefficient
+    """
+    n_samples, _ = X.shape
+
+    if not use_noise_estimate:
+        return np.max(np.dot(X.T, y)) / n_samples
+
+    norm_y = np.linalg.norm(y, ord=2)
+    sigma_0 = (norm_y / np.sqrt(n_samples)) * 1e-3
+    sig_star = max(sigma_0, norm_y / np.sqrt(n_samples))
+
+    return np.max(np.abs(np.dot(X.T, y)) / (n_samples * sig_star))
+
+
+def _check_vim_predict_method(method):
+    """Check if the method is a valid method for variable importance measure
+    prediction"""
+    if method in ["predict", "predict_proba", "decision_function", "transform"]:
+        return method
+    else:
+        raise ValueError(
+            "The method {} is not a valid method for variable importance measure prediction".format(
+                method
+            )
+        )

From b2c706dbaa6facab10d49800e0edba011cc6a83d Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Thu, 19 Dec 2024 19:23:58 +0100
Subject: [PATCH 14/19] Remove not necessary comments

---
 hidimstat/estimator/tests/_utils_test.py               | 3 ---
 hidimstat/estimator/tests/test_RandomForestModified.py | 2 --
 2 files changed, 5 deletions(-)

diff --git a/hidimstat/estimator/tests/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py
index bef030fde..fd14ba6f7 100644
--- a/hidimstat/estimator/tests/_utils_test.py
+++ b/hidimstat/estimator/tests/_utils_test.py
@@ -29,8 +29,6 @@ def generate_data(
         Data matrix
     y : np.array
         Target vector
-    grps : np.array
-        Group vector
     """
     rng = np.random.default_rng(seed)
     if problem_type == "regression":
@@ -48,7 +46,6 @@ def generate_data(
             n_features=n_features,
             random_state=seed,
         )
-        # y = np.array([str(i) for i in y])
 
     X = pd.DataFrame(X, columns=[f"col{i+1}" for i in range(n_features)])
 
diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py
index a7452c706..63bc84bd9 100644
--- a/hidimstat/estimator/tests/test_RandomForestModified.py
+++ b/hidimstat/estimator/tests/test_RandomForestModified.py
@@ -9,8 +9,6 @@
 def test_RandomForestRegressorModified():
     """
     Test the RandomForestRegressorModified for regression.
-    Parameters:
-    - regression_data: A tuple containing the input features (X) and target variable (y) for regression.
     """
     X, y = generate_data(problem_type="regression")
     learner = RandomForestRegressorModified(n_jobs=10, verbose=0)

From 20e09acb1c591f5fc26ca300805406d4ac7735aa Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 7 Mar 2025 10:44:12 +0100
Subject: [PATCH 15/19] Move to src folder

---
 {hidimstat => src/hidimstat}/estimator/_utils/Dnn.py              | 0
 {hidimstat/estimator/tests => test/estimator_tests}/__init__.py   | 0
 .../estimator/tests => test/estimator_tests}/_utils_test.py       | 0
 .../estimator/tests => test/estimator_tests}/test_Dnn_learner.py  | 0
 .../tests => test/estimator_tests}/test_Dnn_learner_single.py     | 0
 .../tests => test/estimator_tests}/test_RandomForestModified.py   | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename {hidimstat => src/hidimstat}/estimator/_utils/Dnn.py (100%)
 rename {hidimstat/estimator/tests => test/estimator_tests}/__init__.py (100%)
 rename {hidimstat/estimator/tests => test/estimator_tests}/_utils_test.py (100%)
 rename {hidimstat/estimator/tests => test/estimator_tests}/test_Dnn_learner.py (100%)
 rename {hidimstat/estimator/tests => test/estimator_tests}/test_Dnn_learner_single.py (100%)
 rename {hidimstat/estimator/tests => test/estimator_tests}/test_RandomForestModified.py (100%)

diff --git a/hidimstat/estimator/_utils/Dnn.py b/src/hidimstat/estimator/_utils/Dnn.py
similarity index 100%
rename from hidimstat/estimator/_utils/Dnn.py
rename to src/hidimstat/estimator/_utils/Dnn.py
diff --git a/hidimstat/estimator/tests/__init__.py b/test/estimator_tests/__init__.py
similarity index 100%
rename from hidimstat/estimator/tests/__init__.py
rename to test/estimator_tests/__init__.py
diff --git a/hidimstat/estimator/tests/_utils_test.py b/test/estimator_tests/_utils_test.py
similarity index 100%
rename from hidimstat/estimator/tests/_utils_test.py
rename to test/estimator_tests/_utils_test.py
diff --git a/hidimstat/estimator/tests/test_Dnn_learner.py b/test/estimator_tests/test_Dnn_learner.py
similarity index 100%
rename from hidimstat/estimator/tests/test_Dnn_learner.py
rename to test/estimator_tests/test_Dnn_learner.py
diff --git a/hidimstat/estimator/tests/test_Dnn_learner_single.py b/test/estimator_tests/test_Dnn_learner_single.py
similarity index 100%
rename from hidimstat/estimator/tests/test_Dnn_learner_single.py
rename to test/estimator_tests/test_Dnn_learner_single.py
diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/test/estimator_tests/test_RandomForestModified.py
similarity index 100%
rename from hidimstat/estimator/tests/test_RandomForestModified.py
rename to test/estimator_tests/test_RandomForestModified.py

From 7afc7e55ec797a4ab9bee4a560b7d0690c1ad41d Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 7 Mar 2025 10:48:13 +0100
Subject: [PATCH 16/19] remove old folder

---
 hidimstat/utils.py | 174 ---------------------------------------------
 1 file changed, 174 deletions(-)
 delete mode 100644 hidimstat/utils.py

diff --git a/hidimstat/utils.py b/hidimstat/utils.py
deleted file mode 100644
index ad8bd8741..000000000
--- a/hidimstat/utils.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import numpy as np
-
-
-def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False):
-    """
-    This function implements the quantile aggregation method for p-values.
-    """
-    # if pvalues are one-dimensional, do nothing
-    if pvals.shape[0] == 1:
-        return pvals[0]
-    if adaptive:
-        return _adaptive_quantile_aggregation(pvals, gamma_min)
-    else:
-        return _fixed_quantile_aggregation(pvals, gamma)
-
-
-def fdr_threshold(pvals, fdr=0.1, method="bhq", reshaping_function=None):
-    if method == "bhq":
-        return _bhq_threshold(pvals, fdr=fdr)
-    elif method == "bhy":
-        return _bhy_threshold(pvals, fdr=fdr, reshaping_function=reshaping_function)
-    elif method == "ebh":
-        return _ebh_threshold(pvals, fdr=fdr)
-    else:
-        raise ValueError("{} is not support FDR control method".format(method))
-
-
-def cal_fdp_power(selected, non_zero_index, r_index=False):
-    """Calculate power and False Discovery Proportion
-
-    Parameters
-    ----------
-    selected: list index (in R format) of selected non-null variables
-    non_zero_index: true index of non-null variables
-    r_index : True if the index is taken from rpy2 inference
-
-    Returns
-    -------
-    fdp: False Discoveries Proportion
-    power: percentage of correctly selected variables over total number of
-        non-null variables
-
-    """
-    # selected is the index list in R and will be different from index of
-    # python by 1 unit
-
-    if selected.size == 0:
-        return 0.0, 0.0
-
-    n_positives = len(non_zero_index)
-
-    if r_index:
-        selected = selected - 1
-
-    true_positive = np.intersect1d(selected, non_zero_index)
-    false_positive = np.setdiff1d(selected, true_positive)
-
-    fdp = len(false_positive) / max(1, len(selected))
-    power = min(len(true_positive), n_positives) / n_positives
-
-    return fdp, power
-
-
-def _bhq_threshold(pvals, fdr=0.1):
-    """Standard Benjamini-Hochberg for controlling False discovery rate"""
-    n_features = len(pvals)
-    pvals_sorted = np.sort(pvals)
-    selected_index = 2 * n_features
-    for i in range(n_features - 1, -1, -1):
-        if pvals_sorted[i] <= fdr * (i + 1) / n_features:
-            selected_index = i
-            break
-    if selected_index <= n_features:
-        return pvals_sorted[selected_index]
-    else:
-        return -1.0
-
-
-def _ebh_threshold(evals, fdr=0.1):
-    """e-BH procedure for FDR control (see Wang and Ramdas 2021)"""
-    n_features = len(evals)
-    evals_sorted = -np.sort(-evals)  # sort in descending order
-    selected_index = 2 * n_features
-    for i in range(n_features - 1, -1, -1):
-        if evals_sorted[i] >= n_features / (fdr * (i + 1)):
-            selected_index = i
-            break
-    if selected_index <= n_features:
-        return evals_sorted[selected_index]
-    else:
-        return np.infty
-
-
-def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1):
-    """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input
-    shape function. Reference: Ramdas et al (2017)
-    """
-    n_features = len(pvals)
-    pvals_sorted = np.sort(pvals)
-    selected_index = 2 * n_features
-    # Default value for reshaping function -- defined in
-    # Benjamini & Yekutieli (2001)
-    if reshaping_function is None:
-        temp = np.arange(n_features)
-        sum_inverse = np.sum(1 / (temp + 1))
-        return _bhq_threshold(pvals, fdr / sum_inverse)
-    else:
-        for i in range(n_features - 1, -1, -1):
-            if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features:
-                selected_index = i
-                break
-        if selected_index <= n_features:
-            return pvals_sorted[selected_index]
-        else:
-            return -1.0
-
-
-def _fixed_quantile_aggregation(pvals, gamma=0.5):
-    """Quantile aggregation function based on Meinshausen et al (2008)
-
-    Parameters
-    ----------
-    pvals : 2D ndarray (n_sampling_with_repetition, n_test)
-        p-value (adjusted)
-
-    gamma : float
-        Percentile value used for aggregation.
-
-    Returns
-    -------
-    1D ndarray (n_tests, )
-        Vector of aggregated p-values
-    """
-    converted_score = (1 / gamma) * (np.percentile(pvals, q=100 * gamma, axis=0))
-
-    return np.minimum(1, converted_score)
-
-
-def _adaptive_quantile_aggregation(pvals, gamma_min=0.05):
-    """adaptive version of the quantile aggregation method, Meinshausen et al.
-    (2008)"""
-    gammas = np.arange(gamma_min, 1.05, 0.05)
-    list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas])
-
-    return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0))
-
-
-def _lambda_max(X, y, use_noise_estimate=True):
-    """Calculation of lambda_max, the smallest value of regularization parameter in
-    lasso program for non-zero coefficient
-    """
-    n_samples, _ = X.shape
-
-    if not use_noise_estimate:
-        return np.max(np.dot(X.T, y)) / n_samples
-
-    norm_y = np.linalg.norm(y, ord=2)
-    sigma_0 = (norm_y / np.sqrt(n_samples)) * 1e-3
-    sig_star = max(sigma_0, norm_y / np.sqrt(n_samples))
-
-    return np.max(np.abs(np.dot(X.T, y)) / (n_samples * sig_star))
-
-
-def _check_vim_predict_method(method):
-    """Check if the method is a valid method for variable importance measure
-    prediction"""
-    if method in ["predict", "predict_proba", "decision_function", "transform"]:
-        return method
-    else:
-        raise ValueError(
-            "The method {} is not a valid method for variable importance measure prediction".format(
-                method
-            )
-        )

From fd655432832e2b1f5d494c97624c9ea285e52413 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 7 Mar 2025 10:48:50 +0100
Subject: [PATCH 17/19] Remove unecesary function from utils

---
 src/hidimstat/utils.py | 117 +----------------------------------------
 1 file changed, 1 insertion(+), 116 deletions(-)

diff --git a/src/hidimstat/utils.py b/src/hidimstat/utils.py
index 2c112a000..4dba00627 100644
--- a/src/hidimstat/utils.py
+++ b/src/hidimstat/utils.py
@@ -1,4 +1,3 @@
-import copy
 import numpy as np
 
 
@@ -326,118 +325,4 @@ def _alpha_max(X, y, use_noise_estimate=False):
 
         alpha_max = np.max(np.abs(np.dot(X.T, y)) / (n_samples * sigma_star))
 
-    return alpha_max
-
-
-########################### Data Preprocessing ##########################
-def create_X_y(
-    X,
-    y,
-    sampling_with_repetition=True,
-    split_percentage=0.8,
-    problem_type="regression",
-    list_continuous=None,
-    random_state=None,
-):
-    """
-    Create train/valid split of input data X and target variable y
-
-    Parameters
-    ----------
-    X : {array-like, sparse matrix} of shape (n_samples, n_features)
-        The input samples before the splitting process.
-    y : ndarray, shape (n_samples, )
-        The output samples before the splitting process.
-    sampling_with_repetition : bool, default=True
-        Sampling with repetition the train part of the train/valid scheme under
-        the training set. The number of training samples in train is equal to
-        the number of instances in the training set.
-    split_percentage : float, default=0.8
-        The training/validation cut for the provided data.
-    problem_type : str, default='regression'
-        A classification or a regression problem.
-    list_continuous : list, default=[]
-        The list of continuous variables.
-    random_state : int, default=2023
-        Fixing the seeds of the random generator.
-
-    Returns
-    -------
-    X_train_scaled : {array-like, sparse matrix} of shape (n_train_samples, n_features)
-        The training input samples with scaled continuous variables.
-    y_train_scaled : {array-like} of shape (n_train_samples, )
-        The sampling_with_repetitionped training output samples scaled if continous.
-    X_validation_scaled : {array-like, sparse matrix} of shape (n_validation_samples, n_features)
-        The validation input samples with scaled continuous variables.
-    y_validation_scaled : {array-like} of shape (n_validation_samples, )
-        The validation output samples scaled if continous.
-    X_scaled : {array-like, sparse matrix} of shape (n_samples, n_features)
-        The original input samples with scaled continuous variables.
-    y_validation : {array-like} of shape (n_samples, )
-        The original output samples with validation indices.
-    scaler_x : Scikit-learn StandardScaler
-        The standard scaler encoder for the continuous variables of the input.
-    scaler_y : Scikit-learn StandardScaler
-        The standard scaler encoder for the output if continuous.
-    valid_ind : list
-        The list of indices of the validation set.
-    """
-    rng = np.random.RandomState(random_state)
-    scaler_x, scaler_y = StandardScaler(), StandardScaler()
-    n = X.shape[0]
-
-    if sampling_with_repetition:
-        train_ind = rng.choice(n, n, replace=True)
-    else:
-        train_ind = rng.choice(
-            n, size=int(np.floor(split_percentage * n)), replace=False
-        )
-    valid_ind = np.array([ind for ind in range(n) if ind not in train_ind])
-
-    X_train, X_validation = X[train_ind], X[valid_ind]
-    y_train, y_validation = y[train_ind], y[valid_ind]
-
-    # Scaling X and y
-    X_train_scaled = X_train.copy()
-    X_validation_scaled = X_validation.copy()
-    X_scaled = X.copy()
-
-    if len(list_continuous) > 0:
-        X_train_scaled[:, list_continuous] = scaler_x.fit_transform(
-            X_train[:, list_continuous]
-        )
-        X_validation_scaled[:, list_continuous] = scaler_x.transform(
-            X_validation[:, list_continuous]
-        )
-        X_scaled[:, list_continuous] = scaler_x.transform(X[:, list_continuous])
-    if problem_type == "regression":
-        y_train_scaled = scaler_y.fit_transform(y_train)
-        y_validation_scaled = scaler_y.transform(y_validation)
-    else:
-        y_train_scaled = y_train.copy()
-        y_validation_scaled = y_validation.copy()
-
-    return (
-        X_train_scaled,
-        y_train_scaled,
-        X_validation_scaled,
-        y_validation_scaled,
-        X_scaled,
-        y_validation,
-        scaler_x,
-        scaler_y,
-        valid_ind,
-    )
-
-
-def _check_vim_predict_method(method):
-    """Check if the method is a valid method for variable importance measure
-    prediction"""
-    if method in ["predict", "predict_proba", "decision_function", "transform"]:
-        return method
-    else:
-        raise ValueError(
-            "The method {} is not a valid method for variable importance measure prediction".format(
-                method
-            )
-        )
\ No newline at end of file
+    return alpha_max
\ No newline at end of file

From bec739b99d04aef92c2b91a82a79f7ef8e03cf8c Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 7 Mar 2025 11:01:34 +0100
Subject: [PATCH 18/19] Remove RF

---
 .../estimator/RandomForestModified.py         |  99 -----------------
 .../test_RandomForestModified.py              | 104 ------------------
 2 files changed, 203 deletions(-)
 delete mode 100644 src/hidimstat/estimator/RandomForestModified.py
 delete mode 100644 test/estimator_tests/test_RandomForestModified.py

diff --git a/src/hidimstat/estimator/RandomForestModified.py b/src/hidimstat/estimator/RandomForestModified.py
deleted file mode 100644
index 097e3d473..000000000
--- a/src/hidimstat/estimator/RandomForestModified.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import numpy as np
-from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
-
-
-class RandomForestClassifierModified(RandomForestClassifier):
-    def fit(self, X, y):
-        self.y_ = y
-        return super().fit(X, y)
-
-    def predict(self, X):
-        return super().predict(X)
-
-    def predict_proba(self, X):
-        return super().predict_proba(X)
-
-    def sample_same_leaf(self, X, y=None):
-        if not (y is None):
-            self.y_ = y
-        rng = np.random.RandomState(self.get_params()["random_state"])
-        # The number of samples to sample from in the next step
-        random_samples = 10
-
-        # Get the leaf indices for each sample in the input data
-        leaf_indices = self.apply(X)
-
-        # Initialize an array to store the predictions for each sample
-        predictions = []
-
-        # Loop over each sample in the input data
-        for i in range(X.shape[0]):
-            # Removing the row of the corresponding input sample
-            leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0)
-            y_minus_i = np.delete(self.y_, i, axis=0)
-
-            # The list of indices sampled from the same leaf of the input sample
-            leaf_samples = []
-            # Loop over each tree in the forest
-            for j in range(self.n_estimators):
-                # Find the samples that fall into the same leaf node for this tree
-                samples_in_leaf = np.where(
-                    leaf_indices_minus_i[:, j] == leaf_indices[i, j]
-                )[0]
-
-                # Append the samples to the list
-                if samples_in_leaf.size > 0:
-                    leaf_samples.append(
-                        y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
-                    )
-
-            predictions.append(leaf_samples)
-
-        # Combine the predictions from all trees to make the final prediction
-        return np.array(predictions, dtype=object)
-
-
-class RandomForestRegressorModified(RandomForestRegressor):
-    def fit(self, X, y):
-        self.y_ = y
-        return super().fit(X, y)
-
-    def predict(self, X):
-        return super().predict(X)
-
-    def sample_same_leaf(self, X):
-        rng = np.random.RandomState(self.get_params()["random_state"])
-        # The number of samples to sample from in the next step
-        random_samples = 10
-
-        # Get the leaf indices for each sample in the input data
-        leaf_indices = self.apply(X)
-
-        # Initialize an array to store the predictions for each sample
-        predictions = []
-
-        # Loop over each sample in the input data
-        for i in range(X.shape[0]):
-            # Removing the row of the corresponding input sample
-            leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0)
-            y_minus_i = np.delete(self.y_, i, axis=0)
-
-            # The list of indices sampled from the same leaf of the input sample
-            leaf_samples = []
-            # Loop over each tree in the forest
-            for j in range(self.n_estimators):
-                # Find the samples that fall into the same leaf node for this tree
-                samples_in_leaf = np.where(
-                    leaf_indices_minus_i[:, j] == leaf_indices[i, j]
-                )[0]
-
-                # Append the samples to the list
-                if samples_in_leaf.size > 0:
-                    leaf_samples.append(
-                        y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
-                    )
-
-            predictions.append(leaf_samples)
-
-        # Combine the predictions from all trees to make the final prediction
-        return np.array(predictions, dtype=object)
diff --git a/test/estimator_tests/test_RandomForestModified.py b/test/estimator_tests/test_RandomForestModified.py
deleted file mode 100644
index 63bc84bd9..000000000
--- a/test/estimator_tests/test_RandomForestModified.py
+++ /dev/null
@@ -1,104 +0,0 @@
-from hidimstat.estimator.tests._utils_test import generate_data
-from hidimstat.estimator.RandomForestModified import (
-    RandomForestClassifierModified,
-    RandomForestRegressorModified,
-)
-import numpy as np
-
-
-def test_RandomForestRegressorModified():
-    """
-    Test the RandomForestRegressorModified for regression.
-    """
-    X, y = generate_data(problem_type="regression")
-    learner = RandomForestRegressorModified(n_jobs=10, verbose=0)
-    learner.fit(X, y)
-    predict = learner.predict(X)
-    # Check if the predicted values are close to the true values for at least one instance
-    assert np.max(np.abs(predict - y)) < 200.0
-    # Check if the predicted values are close to the true values for at least one instance
-    assert np.all(predict == y) or np.any(predict != y)
-    # Check if the predicted values are not all the same
-    assert not np.all(predict == predict[0])
-    # Check if the predicted values are not all zeros
-    assert not np.all(predict == 0)
-    # Check if the predicted values are not all ones
-    assert not np.all(predict == 1)
-    # Check if the feature importances are not all zeros
-    assert not np.all(learner.feature_importances_ == 0)
-    # Check if the feature importances are not all the same
-    assert not np.all(learner.feature_importances_ == learner.feature_importances_[0])
-    # Check if the feature importances are not all ones
-    assert not np.all(learner.feature_importances_ == 1)
-    # Check if the feature importances are not all negative
-    assert not np.all(learner.feature_importances_ < 0)
-    # # Check if the feature importances are not all positive
-    # assert not np.all(learner.feature_importances_ > 0)
-    # Check if the feature importances are not all close to zero
-    assert not np.allclose(learner.feature_importances_, 0)
-    # Check if the feature importances are not all close to one
-    assert not np.allclose(learner.feature_importances_, 1)
-
-    predictions = learner.sample_same_leaf(X)
-    # TODO: add more tests for sample_same_leaf
-
-
-def test_RandomForestClassifierModified():
-    """
-    Test the RandomForestClassifierModified for classification.
-    """
-    X, y = generate_data(problem_type="classification")
-    learner = RandomForestClassifierModified(n_jobs=10, verbose=0)
-    learner.fit(X, y)
-    predict_prob = learner.predict_proba(X)
-    # Check if the predicted probabilities sum up to 1 for each instance
-    assert np.allclose(np.sum(predict_prob, axis=1), 1)
-    # Check if the predicted class labels match the true labels for at least one instance
-    assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0
-    assert np.all(np.max(predict_prob, axis=1) >= 0.5)
-    assert np.all(np.min(predict_prob, axis=1) < 0.5)
-    # Check if the maximum predicted probability is greater than 0.95
-    assert 0.95 < np.max(predict_prob)
-    # Check if the minimum predicted probability is less than 0.05
-    assert 0.05 > np.min(predict_prob)
-    # Check if the predicted probabilities are not all the same
-    assert not np.all(predict_prob == predict_prob[0])
-    # Check if the predicted probabilities are not all zeros
-    assert not np.all(predict_prob == 0)
-    # Check if the predicted probabilities are not all ones
-    assert not np.all(predict_prob == 1)
-    # Check if the predicted probabilities are not all the same for each class
-    assert not np.all(predict_prob[:, 0] == predict_prob[0, 0])
-    assert not np.all(predict_prob[:, 1] == predict_prob[0, 1])
-    # Check if the predicted probabilities are not all zeros for each class
-    assert not np.all(predict_prob[:, 0] == 0)
-    assert not np.all(predict_prob[:, 1] == 0)
-    # Check if the predicted probabilities are not all ones for each class
-    assert not np.all(predict_prob[:, 0] == 1)
-
-    predict = learner.predict(X)
-    # Check if the predicted values are close to the true values for at least one instance
-    assert np.all(predict == y) or np.any(predict != y)
-    # Check if the predicted values are not all the same
-    assert not np.all(predict == predict[0])
-    # Check if the predicted values are not all zeros
-    assert not np.all(predict == 0)
-    # Check if the predicted values are not all ones
-    assert not np.all(predict == 1)
-    # Check if the feature importances are not all zeros
-    assert not np.all(learner.feature_importances_ == 0)
-    # Check if the feature importances are not all the same
-    assert not np.all(learner.feature_importances_ == learner.feature_importances_[0])
-    # Check if the feature importances are not all ones
-    assert not np.all(learner.feature_importances_ == 1)
-    # Check if the feature importances are not all negative
-    assert not np.all(learner.feature_importances_ < 0)
-    # # Check if the feature importances are not all positive
-    # assert not np.all(learner.feature_importances_ > 0)
-    # Check if the feature importances are not all close to zero
-    assert not np.allclose(learner.feature_importances_, 0)
-    # Check if the feature importances are not all close to one
-    assert not np.allclose(learner.feature_importances_, 1)
-
-    predictions = learner.sample_same_leaf(X)
-    # TODO: add more tests for sample_same_leaf

From 7d3077221a5edc069206c887193f01a34fe26ed4 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Tue, 11 Mar 2025 13:07:17 +0100
Subject: [PATCH 19/19] Remove example link to RandomForest

---
 .../plot_residuals_sampling.py                | 237 ------------------
 1 file changed, 237 deletions(-)
 delete mode 100644 examples_not_exhibited/plot_residuals_sampling.py

diff --git a/examples_not_exhibited/plot_residuals_sampling.py b/examples_not_exhibited/plot_residuals_sampling.py
deleted file mode 100644
index ec30d6ec4..000000000
--- a/examples_not_exhibited/plot_residuals_sampling.py
+++ /dev/null
@@ -1,237 +0,0 @@
-"""
-Conditional sampling using residuals vs sampling Random Forest
-==============================================================
-
-To deploy the Conditional Permutation Importance (CPI),
-:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the
-conditional scheme: 1) Instead of directly permuting the variable of interest as
-in the Permutation Feature Importance (PFI), the residuals of the prediction of
-the variable of interest x_j based on the remaining variables is first computed
-along with a predicted version x_hat_j. These residuals are shuffled and added
-to the predicted version to recreate the variable of interest (Preserving the
-dependency between the variable of interest and the remaining variables while
-breaking the relationship with the outcome). 2) Another option is to use the
-sampling Random Forest. Using the remaining variables to predict the variable of
-interest, and instead of predicting the variable of interest as the mean of the
-instances' outcome of the targeted leaf or the class with the most occurences,
-we sample from the same leaf of the instance of interest within its neighbors,
-and we follow the standard path of the Random Forest.
-
-References
-----------
-.. footbibliography::
-
-"""
-
-#############################################################################
-# Imports needed for this script
-# ------------------------------
-
-from hidimstat import BlockBasedImportance
-from joblib import Parallel, delayed
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from scipy.linalg import cholesky
-from scipy.stats import norm
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.metrics import roc_auc_score
-import time
-
-n, p = (100, 12)
-inter_cor, intra_cor = (0, 0.85)
-n_blocks = 1
-n_signal = 2
-problem_type = "regression"
-snr = 4
-rf = RandomForestRegressor(random_state=2023)
-dict_hyper = {"max_depth": [2, 5, 10, 20]}
-
-#############################################################################
-# Generate the synthetic data
-# ---------------------------
-# The function below generates the correlation matrix between the variables
-# according to the provided degrees of correlation (intra + inter). `inter_cor`
-# indicates the degree of correlation between the variables/groups whereas
-# `intra_cor` specifies the corresponding degree between the variables within
-# each group. For the single-level case, `n_blocks` is set to 1 and the
-# `intra_cor` is the unique correlation between variables.
-#
-# Next, we generate the synthetic data by randomly drawing n_signal predictors
-# from the corresponding p variables and reordering the set of variables to put the
-# n_signal predictors at the beginning. Following, the response is generated
-# under a simple linear model with Gaussian noise.
-
-
-def generate_cor_blocks(p, inter_cor, intra_cor, n_blocks):
-    vars_per_grp = int(p / n_blocks)
-    cor_mat = np.zeros((p, p))
-    cor_mat.fill(inter_cor)
-    for i in range(n_blocks):
-        cor_mat[
-            (i * vars_per_grp) : ((i + 1) * vars_per_grp),
-            (i * vars_per_grp) : ((i + 1) * vars_per_grp),
-        ] = intra_cor
-    np.fill_diagonal(cor_mat, 1)
-    return cor_mat
-
-
-def _generate_data(seed):
-    rng = np.random.RandomState(seed)
-
-    cor_mat = generate_cor_blocks(p, inter_cor, intra_cor, n_blocks)
-    x = norm.rvs(size=(p, n), random_state=seed)
-    c = cholesky(cor_mat, lower=True)
-    X = pd.DataFrame(np.dot(c, x).T, columns=[str(i) for i in np.arange(p)])
-
-    data = X.copy()
-
-    # Randomly draw n_signal predictors which are defined as signal predictors
-    indices_var = list(rng.choice(range(data.shape[1]), size=n_signal, replace=False))
-
-    # Reorder data matrix so that first n_signal predictors are the signal predictors
-    # List of remaining indices
-    indices_rem = [ind for ind in range(data.shape[1]) if ind not in indices_var]
-    total_indices = indices_var + indices_rem
-    # Before including the non-linear effects
-    data = data.iloc[:, total_indices]
-    data_signal = data.iloc[:, np.arange(n_signal)]
-
-    # Determine beta coefficients
-    effectset = [-0.5, -1, -2, -3, 0.5, 1, 2, 3]
-    beta = rng.choice(effectset, size=data_signal.shape[1], replace=True)
-
-    # Generate response
-    # The product of the signal predictors with the beta coefficients
-    prod_signal = np.dot(data_signal, beta)
-
-    sigma_noise = np.linalg.norm(prod_signal, ord=2) / (
-        snr * np.sqrt(data_signal.shape[0])
-    )
-    y = prod_signal + sigma_noise * rng.normal(size=prod_signal.shape[0])
-
-    return data, y
-
-
-#############################################################################
-# Processing across multiple permutations
-# ---------------------------------------
-# In order to get statistical significance with p-values, we run the experiments
-# across 10 repetitions.
-#
-
-
-def compute_simulations(seed):
-    X, y = _generate_data(seed)
-    # Using the residuals
-    start_residuals = time.time()
-    bbi_residual = BlockBasedImportance(
-        estimator="RF",
-        importance_estimator="residuals_RF",
-        do_hypertuning=True,
-        dict_hypertuning=None,
-        conditional=True,
-        n_permutations=10,
-        n_jobs=2,
-        problem_type="regression",
-        k_fold=2,
-        variables_categories={},
-    )
-    bbi_residual.fit(X, y)
-    results_bbi_residual = bbi_residual.compute_importance()
-
-    df_residuals = {}
-    df_residuals["method"] = ["residuals"] * X.shape[1]
-    df_residuals["score"] = [results_bbi_residual["score_R2"]] * X.shape[1]
-    df_residuals["elapsed"] = [time.time() - start_residuals] * X.shape[1]
-    df_residuals["importance"] = np.ravel(results_bbi_residual["importance"])
-    df_residuals["p-value"] = np.ravel(results_bbi_residual["pval"])
-    df_residuals["iteration"] = [seed] * X.shape[1]
-    df_residuals = pd.DataFrame(df_residuals)
-
-    # Using the sampling RF
-    start_sampling = time.time()
-    bbi_sampling = BlockBasedImportance(
-        estimator="RF",
-        importance_estimator="sampling_RF",
-        do_hypertuning=True,
-        dict_hypertuning=None,
-        conditional=True,
-        n_permutations=10,
-        n_jobs=2,
-        problem_type="regression",
-        k_fold=2,
-        variables_categories={},
-    )
-    bbi_sampling.fit(X, y)
-    results_bbi_sampling = bbi_sampling.compute_importance()
-
-    df_sampling = {}
-    df_sampling["method"] = ["sampling"] * X.shape[1]
-    df_sampling["score"] = [results_bbi_sampling["score_R2"]] * X.shape[1]
-    df_sampling["elapsed"] = [time.time() - start_sampling] * X.shape[1]
-    df_sampling["importance"] = np.ravel(results_bbi_sampling["importance"])
-    df_sampling["p-value"] = np.ravel(results_bbi_sampling["pval"])
-    df_sampling["iteration"] = [seed] * X.shape[1]
-    df_sampling = pd.DataFrame(df_sampling)
-
-    df_final = pd.concat([df_residuals, df_sampling], axis=0)
-    return df_final
-
-
-# Running across 10 repetitions
-parallel = Parallel(n_jobs=2, verbose=1)
-final_result = parallel(
-    delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11)
-)
-
-#############################################################################
-# Plotting AUC score and Type-I error
-# -----------------------------------
-# With the prediction problems turns to be a binary classification problem for
-# the variables being relevant or non-relevant vs the ground-truth, we measure
-# the performance in terms of type-I error i.e. the rate of true non-relevant
-# variables detected as relevant and AUC score related to correct significant
-# variables ordering.
-#
-
-df_final_result = pd.concat(final_result, axis=0).reset_index(drop=True)
-df_auc = df_final_result.groupby(by=["method", "iteration"]).apply(
-    lambda x: roc_auc_score([1] * n_signal + [0] * (p - n_signal), -x["p-value"])
-)
-df_auc = df_auc.reset_index(name="auc")
-df_type_I = df_final_result.groupby(by=["method", "iteration"]).apply(
-    lambda x: sum(x.iloc[n_signal:, :]["p-value"] <= 0.05) / x.iloc[2:, :].shape[0]
-)
-df_type_I = df_type_I.reset_index(name="type-I")
-
-auc = [
-    np.array(df_auc["auc"])[: int(df_auc.shape[0] / 2)],
-    np.array(df_auc["auc"])[int(df_auc.shape[0] / 2) :],
-]
-typeI_error = [
-    np.array(df_type_I["type-I"])[: int(df_type_I.shape[0] / 2)],
-    np.array(df_type_I["type-I"])[int(df_type_I.shape[0] / 2) :],
-]
-
-fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), sharey=True)
-
-# AUC score
-axs[0].violinplot(auc, showmeans=False, showmedians=True, vert=False)
-axs[0].set_title("AUC score")
-axs[0].xaxis.grid(True)
-axs[0].set_yticks([x + 1 for x in range(len(auc))], labels=["Residuals", "Sampling"])
-axs[0].set_ylabel("Method")
-
-# Type-I Error
-axs[1].violinplot(typeI_error, showmeans=False, showmedians=True, vert=False)
-axs[1].set_title("Type-I Error")
-axs[1].axvline(x=0.05, color="r", label="Nominal Rate")
-plt.show()
-
-#############################################################################
-# Analysis of the results
-# -----------------------
-# We can observe that the sampling approaches'performance is almost similar to
-# that of the residuals. Sampling accelerates the conditional importance
-# computation by simplifying the residuals steps.