From 717330b2887c994f38d7bc6d1f081148ce15537a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 14:28:19 +0100 Subject: [PATCH 01/19] Create a folder with the estimators --- src/hidimstat/__init__.py | 2 -- src/hidimstat/{ => estimator}/Dnn_learner.py | 0 src/hidimstat/{ => estimator}/Dnn_learner_single.py | 0 src/hidimstat/{ => estimator}/RandomForestModified.py | 0 src/hidimstat/estimator/__init__.py | 5 +++++ 5 files changed, 5 insertions(+), 2 deletions(-) rename src/hidimstat/{ => estimator}/Dnn_learner.py (100%) rename src/hidimstat/{ => estimator}/Dnn_learner_single.py (100%) rename src/hidimstat/{ => estimator}/RandomForestModified.py (100%) create mode 100644 src/hidimstat/estimator/__init__.py diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index f8114696a..4a8cdcb33 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,6 +1,5 @@ from .clustered_inference import clustered_inference, hd_inference from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso -from .Dnn_learner_single import DnnLearnerSingle from .ensemble_clustered_inference import ensemble_clustered_inference from .knockoff_aggregation import knockoff_aggregation from .knockoffs import model_x_knockoff @@ -25,7 +24,6 @@ "dcrt_zero", "desparsified_lasso", "desparsified_group_lasso", - "DnnLearnerSingle", "ensemble_clustered_inference", "group_reid", "hd_inference", diff --git a/src/hidimstat/Dnn_learner.py b/src/hidimstat/estimator/Dnn_learner.py similarity index 100% rename from src/hidimstat/Dnn_learner.py rename to src/hidimstat/estimator/Dnn_learner.py diff --git a/src/hidimstat/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py similarity index 100% rename from src/hidimstat/Dnn_learner_single.py rename to src/hidimstat/estimator/Dnn_learner_single.py diff --git a/src/hidimstat/RandomForestModified.py b/src/hidimstat/estimator/RandomForestModified.py similarity index 100% rename from src/hidimstat/RandomForestModified.py rename to src/hidimstat/estimator/RandomForestModified.py diff --git a/src/hidimstat/estimator/__init__.py b/src/hidimstat/estimator/__init__.py new file mode 100644 index 000000000..afbfacd38 --- /dev/null +++ b/src/hidimstat/estimator/__init__.py @@ -0,0 +1,5 @@ +from .Dnn_learner_single import DnnLearnerSingle + +__all__ = [ + "DnnLearnerSingle", +] From 99e1092722f1d0aa64d9c54af71eb452e7e61682 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 14:29:42 +0100 Subject: [PATCH 02/19] Transfer of function from global utils to local utils --- hidimstat/estimator/_utils/u_Dnn_learner.py | 657 ++++++++++++++++++++ src/hidimstat/utils.py | 610 +----------------- 2 files changed, 658 insertions(+), 609 deletions(-) create mode 100644 hidimstat/estimator/_utils/u_Dnn_learner.py diff --git a/hidimstat/estimator/_utils/u_Dnn_learner.py b/hidimstat/estimator/_utils/u_Dnn_learner.py new file mode 100644 index 000000000..150e3f262 --- /dev/null +++ b/hidimstat/estimator/_utils/u_Dnn_learner.py @@ -0,0 +1,657 @@ + +import numpy as np +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchmetrics import Accuracy +from sklearn.metrics import log_loss, mean_squared_error +from sklearn.preprocessing import StandardScaler + + + +def create_X_y( + X, + y, + sampling_with_repetition=True, + split_percentage=0.8, + problem_type="regression", + list_continuous=None, + random_state=None, +): + """ + Create train/valid split of input data X and target variable y + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples before the splitting process. + y : ndarray, shape (n_samples, ) + The output samples before the splitting process. + sampling_with_repetition : bool, default=True + Sampling with repetition the train part of the train/valid scheme under + the training set. The number of training samples in train is equal to + the number of instances in the training set. + split_percentage : float, default=0.8 + The training/validation cut for the provided data. + problem_type : str, default='regression' + A classification or a regression problem. + list_continuous : list, default=[] + The list of continuous variables. + random_state : int, default=2023 + Fixing the seeds of the random generator. + + Returns + ------- + X_train_scaled : {array-like, sparse matrix} of shape (n_train_samples, n_features) + The sampling_with_repetitionped training input samples with scaled continuous variables. + y_train_scaled : {array-like} of shape (n_train_samples, ) + The sampling_with_repetitionped training output samples scaled if continous. + X_validation_scaled : {array-like, sparse matrix} of shape (n_validation_samples, n_features) + The validation input samples with scaled continuous variables. + y_validation_scaled : {array-like} of shape (n_validation_samples, ) + The validation output samples scaled if continous. + X_scaled : {array-like, sparse matrix} of shape (n_samples, n_features) + The original input samples with scaled continuous variables. + y_validation : {array-like} of shape (n_samples, ) + The original output samples with validation indices. + scaler_x : Scikit-learn StandardScaler + The standard scaler encoder for the continuous variables of the input. + scaler_y : Scikit-learn StandardScaler + The standard scaler encoder for the output if continuous. + valid_ind : list + The list of indices of the validation set. + """ + rng = np.random.RandomState(random_state) + scaler_x, scaler_y = StandardScaler(), StandardScaler() + n = X.shape[0] + + if sampling_with_repetition: + train_ind = rng.choice(n, n, replace=True) + else: + train_ind = rng.choice( + n, size=int(np.floor(split_percentage * n)), replace=False + ) + valid_ind = np.array([ind for ind in range(n) if ind not in train_ind]) + + X_train, X_validation = X[train_ind], X[valid_ind] + y_train, y_validation = y[train_ind], y[valid_ind] + + # Scaling X and y + X_train_scaled = X_train.copy() + X_validation_scaled = X_validation.copy() + X_scaled = X.copy() + + if len(list_continuous) > 0: + X_train_scaled[:, list_continuous] = scaler_x.fit_transform( + X_train[:, list_continuous] + ) + X_validation_scaled[:, list_continuous] = scaler_x.transform( + X_validation[:, list_continuous] + ) + X_scaled[:, list_continuous] = scaler_x.transform(X[:, list_continuous]) + if problem_type == "regression": + y_train_scaled = scaler_y.fit_transform(y_train) + y_validation_scaled = scaler_y.transform(y_validation) + else: + y_train_scaled = y_train.copy() + y_validation_scaled = y_validation.copy() + + return ( + X_train_scaled, + y_train_scaled, + X_validation_scaled, + y_validation_scaled, + X_scaled, + y_validation, + scaler_x, + scaler_y, + valid_ind, + ) + + +def sigmoid(x): + """ + This function applies the sigmoid function element-wise to the input array x + """ + return 1 / (1 + np.exp(-x)) + + +def softmax(x): + """ + This function applies the softmax function element-wise to the input array x + """ + # Ensure numerical stability by subtracting the maximum value of x from each element of x + # This prevents overflow errors when exponentiating large numbers + x = x - np.max(x, axis=-1, keepdims=True) + exp_x = np.exp(x) + return exp_x / np.sum(exp_x, axis=-1, keepdims=True) + + +def relu(x): + """ + This function applies the relu function element-wise to the input array x + """ + return (abs(x) + x) / 2 + + +def ordinal_encode(y): + """ + This function encodes the ordinal variable with a special gradual encoding storing also + the natural order information. + """ + list_y = [] + for y_col in range(y.shape[-1]): + # Retrieve the unique values + unique_vals = np.unique(y[:, y_col]) + # Mapping each unique value to its corresponding index + mapping_dict = {} + for i, val in enumerate(unique_vals): + mapping_dict[val] = i + 1 + # create a zero-filled array for the ordinal encoding + y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col])))) + # set the appropriate indices to 1 for each ordinal value and all lower ordinal values + for ind_el, el in enumerate(y[:, y_col]): + y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1 + list_y.append(y_ordinal[:, 1:]) + + return list_y + + +def joblib_ensemble_dnnet( + X, + y, + problem_type="regression", + activation_outcome=None, + list_continuous=None, + list_grps=None, + sampling_with_repetition=False, + split_percentage=0.8, + group_stacking=False, + input_dimensions=None, + n_epoch=200, + batch_size=32, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + random_state=None, +): + """ + This function implements the ensemble learning of the sub-DNN models + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_train_samples, n_features) + The input samples. + y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + problem_type : str, default='regression' + A classification or a regression problem. + activation_outcome : str, default=None + The activation function to apply in the outcome layer, "softmax" for + classification and "sigmoid" for both ordinal and binary cases. + list_continuous : list, default=None + The list of continuous variables. + list_grps : list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. + sampling_with_repetition : bool, default=True + Application of sampling_with_repetition sampling for the training set. + split_percentage : float, default=0.8 + The training/validation cut for the provided data. + group_stacking : bool, default=False + Apply the stacking-based method for the provided groups. + input_dimensions : list, default=None + The cumsum of inputs after the linear sub-layers. + n_epoch : int, default=200 + The number of epochs for the DNN learner(s). + batch_size : int, default=32 + The number of samples per batch for training. + beta1 : float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2 : float, default=0.999 + The exponential decay rate for the second moment estimates. + lr : float, default=1e-3 + The learning rate. + l1_weight : float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight : float, default=0 + The L2-regularization paramter for weight decay. + epsilon : float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + random_state : int, default=2023 + Fixing the seeds of the random generator. + + Returns + ------- + current_model : list + The parameters of the sub-DNN model + scaler_x : list of Scikit-learn StandardScalers + The scalers for the continuous input variables. + scaler_y : Scikit-learn StandardScaler + The scaler for the continuous output variable. + pred_v : ndarray + The predictions of the sub-DNN model. + loss : float + The loss score of the sub-DNN model. + """ + + pred_v = np.empty(X.shape[0]) + # Sampling and Train/Validate splitting + ( + X_train_scaled, + y_train_scaled, + X_validation_scaled, + y_validation_scaled, + X_scaled, + y_validation, + scaler_x, + scaler_y, + valid_ind, + ) = create_X_y( + X, + y, + sampling_with_repetition=sampling_with_repetition, + split_percentage=split_percentage, + problem_type=problem_type, + list_continuous=list_continuous, + random_state=random_state, + ) + + current_model = dnn_net( + X_train_scaled, + y_train_scaled, + X_validation_scaled, + y_validation_scaled, + problem_type=problem_type, + n_epoch=n_epoch, + batch_size=batch_size, + beta1=beta1, + beta2=beta2, + lr=lr, + l1_weight=l1_weight, + l2_weight=l2_weight, + epsilon=epsilon, + list_grps=list_grps, + group_stacking=group_stacking, + input_dimensions=input_dimensions, + random_state=random_state, + ) + + if not group_stacking: + X_scaled_n = X_scaled.copy() + else: + X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1])) + for grp_ind in range(len(list_grps)): + n_layer_stacking = len(current_model[3][grp_ind]) - 1 + curr_pred = X_scaled[:, list_grps[grp_ind]].copy() + for ind_w_b in range(n_layer_stacking): + if ind_w_b == 0: + curr_pred = relu( + X_scaled[:, list_grps[grp_ind]].dot( + current_model[3][grp_ind][ind_w_b] + ) + + current_model[4][grp_ind][ind_w_b] + ) + else: + curr_pred = relu( + curr_pred.dot(current_model[3][grp_ind][ind_w_b]) + + current_model[4][grp_ind][ind_w_b] + ) + X_scaled_n[ + :, + list( + np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1]) + ), + ] = ( + curr_pred.dot(current_model[3][grp_ind][n_layer_stacking]) + + current_model[4][grp_ind][n_layer_stacking] + ) + + n_layer = len(current_model[0]) - 1 + for j in range(n_layer): + if j == 0: + pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j]) + else: + pred = relu(pred.dot(current_model[0][j]) + current_model[1][j]) + + pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer] + + if problem_type not in ("classification", "binary"): + if problem_type != "ordinal": + pred_v = pred * scaler_y.scale_ + scaler_y.mean_ + else: + pred_v = activation_outcome[problem_type](pred) + loss = np.std(y_validation) ** 2 - mean_squared_error( + y_validation, pred_v[valid_ind] + ) + else: + pred_v = activation_outcome[problem_type](pred) + loss = log_loss( + y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0) + ) - log_loss(y_validation, pred_v[valid_ind]) + + return (current_model, scaler_x, scaler_y, pred_v, loss) + + +def _initialize_weights(layer): + if isinstance(layer, nn.Linear): + layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2 + layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1 + + +def _dataset_Loader(X, y, shuffle=False, batch_size=50): + if y.shape[-1] == 2: + y = y[:, [1]] + dataset = torch.utils.data.TensorDataset( + torch.from_numpy(X).float(), torch.from_numpy(y).float() + ) + + loader = torch.utils.data.DataLoader( + dataset, batch_size=batch_size, shuffle=shuffle + ) + return loader + + +class DNN(nn.Module): + """ + Feedfoward Neural Network with 4 hidden layers + """ + + def __init__( + self, input_dim, group_stacking, list_grps, output_dimension, problem_type + ): + super().__init__() + if problem_type == "classification": + self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension) + else: + self.accuracy = Accuracy(task="binary") + self.list_grps = list_grps + self.group_stacking = group_stacking + if group_stacking: + self.layers_stacking = nn.ModuleList( + [ + nn.Linear( + in_features=len(grp), + out_features=input_dim[grp_ind + 1] - input_dim[grp_ind], + ) + # nn.Sequential( + # nn.Linear( + # in_features=len(grp), + # # out_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # ) + for grp_ind, grp in enumerate(list_grps) + ] + ) + input_dim = input_dim[-1] + self.layers = nn.Sequential( + # hidden layers + nn.Linear(input_dim, 50), + nn.ReLU(), + nn.Linear(50, 40), + nn.ReLU(), + nn.Linear(40, 30), + nn.ReLU(), + nn.Linear(30, 20), + nn.ReLU(), + # output layer + nn.Linear(20, output_dimension), + ) + self.loss = 0 + + def forward(self, x): + if self.group_stacking: + list_stacking = [None] * len(self.layers_stacking) + for ind_layer, layer in enumerate(self.layers_stacking): + list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]]) + x = torch.cat(list_stacking, dim=1) + return self.layers(x) + + def training_step(self, batch, device, problem_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if problem_type == "regression": + loss = F.mse_loss(y_pred, y) + elif problem_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = F.binary_cross_entropy_with_logits(y_pred, y) + return loss + + def validation_step(self, batch, device, problem_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if problem_type == "regression": + loss = F.mse_loss(y_pred, y) + return { + "val_mse": loss, + "batch_size": len(X), + } + else: + if problem_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = F.binary_cross_entropy_with_logits(y_pred, y) + acc = self.accuracy(y_pred, y.int()) + return { + "val_loss": loss, + "val_acc": acc, + "batch_size": len(X), + } + + def validation_epoch_end(self, outputs, problem_type): + if problem_type in ("classification", "binary", "ordinal"): + batch_losses = [] + batch_accs = [] + batch_sizes = [] + for x in outputs: + batch_losses.append(x["val_loss"] * x["batch_size"]) + batch_accs.append(x["val_acc"] * x["batch_size"]) + batch_sizes.append(x["batch_size"]) + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + epoch_acc = torch.stack(batch_accs).sum().item() / np.sum( + batch_sizes + ) # Combine accuracies + return {"val_loss": self.loss, "val_acc": epoch_acc} + else: + batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs] + batch_sizes = [x["batch_size"] for x in outputs] + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + return {"val_mse": self.loss} + + def epoch_end(self, epoch, result): + if len(result) == 2: + print( + "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format( + epoch + 1, result["val_loss"], result["val_acc"] + ) + ) + else: + print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"])) + + +def _evaluate(model, loader, device, problem_type): + outputs = [model.validation_step(batch, device, problem_type) for batch in loader] + return model.validation_epoch_end(outputs, problem_type) + + +def dnn_net( + X_train, + y_train, + X_validation, + y_validation, + problem_type="regression", + n_epoch=200, + batch_size=32, + batch_size_validation=128, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + list_grps=None, + group_stacking=False, + input_dimensions=None, + random_state=2023, + verbose=0, +): + """ + This function implements the training/validation process of the sub-DNN + models + + Parameters + ---------- + X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features) + The training input samples. + y_train : {array-like} of shape (n_train_samples, ) + The training output samples. + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) + The validation input samples. + y_validation : {array-like} of shape (n_validation_samples, ) + The validation output samples. + problem_type : str, default='regression' + A classification or a regression problem. + n_epoch : int, default=200 + The number of epochs for the DNN learner(s). + batch_size : int, default=32 + The number of samples per batch for training. + batch_size_validation : int, default=128 + The number of samples per batch for validation. + beta1 : float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2 : float, default=0.999 + The exponential decay rate for the second moment estimates. + lr : float, default=1e-3 + The learning rate. + l1_weight : float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight : float, default=0 + The L2-regularization paramter for weight decay. + epsilon : float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + list_grps : list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. + group_stacking : bool, default=False + Apply the stacking-based method for the provided groups. + input_dimensions : list, default=None + The cumsum of inputs after the linear sub-layers. + random_state : int, default=2023 + Fixing the seeds of the random generator. + verbose : int, default=0 + If verbose > 0, the fitted iterations will be printed. + """ + # Creating DataLoaders + train_loader = _dataset_Loader( + X_train, + y_train, + shuffle=True, + batch_size=batch_size, + ) + validate_loader = _dataset_Loader( + X_validation, y_validation, batch_size=batch_size_validation + ) + # Set the seed for PyTorch's random number generator + torch.manual_seed(random_state) + + # Set the seed for PyTorch's CUDA random number generator(s), if available + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_state) + torch.cuda.manual_seed_all(random_state) + + # Specify whether to use GPU or CPU + # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cpu") + + if problem_type in ("regression", "binary"): + output_dimension = 1 + else: + output_dimension = y_train.shape[-1] + + # DNN model + input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1] + model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type) + model.to(device) + # Initializing weights/bias + model.apply(_initialize_weights) + # Adam Optimizer + optimizer = torch.optim.Adam( + model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon + ) + + best_loss = 1e100 + for epoch in range(n_epoch): + # Training Phase + model.train() + for batch in train_loader: + optimizer.zero_grad() + loss = model.training_step(batch, device, problem_type) + + loss.backward() + optimizer.step() + for name, param in model.named_parameters(): + if "bias" not in name: + # if name.split(".")[0] == "layers_stacking": + # param.data -= l2_weight * param.data + # else: + param.data -= ( + l1_weight * torch.sign(param.data) + l2_weight * param.data + ) + # Validation Phase + model.eval() + result = _evaluate(model, validate_loader, device, problem_type) + if model.loss < best_loss: + best_loss = model.loss + dict_params = copy.deepcopy(model.state_dict()) + if verbose >= 2: + model.epoch_end(epoch, result) + + best_weight = [] + best_bias = [] + best_weight_stack = [[].copy() for _ in range(len(list_grps))] + best_bias_stack = [[].copy() for _ in range(len(list_grps))] + + for name, param in dict_params.items(): + if name.split(".")[0] == "layers": + if name.split(".")[-1] == "weight": + best_weight.append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias.append(param.numpy()[np.newaxis, :]) + if name.split(".")[0] == "layers_stacking": + curr_ind = int(name.split(".")[1]) + if name.split(".")[-1] == "weight": + best_weight_stack[curr_ind].append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :]) + + return [ + best_weight, + best_bias, + best_loss, + best_weight_stack, + best_bias_stack, + ] \ No newline at end of file diff --git a/src/hidimstat/utils.py b/src/hidimstat/utils.py index 14f4d5ef3..2c112a000 100644 --- a/src/hidimstat/utils.py +++ b/src/hidimstat/utils.py @@ -1,11 +1,5 @@ import copy import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from sklearn.metrics import log_loss, mean_squared_error -from sklearn.preprocessing import StandardScaler -from torchmetrics import Accuracy ########################## quantile aggregation method ########################## @@ -446,606 +440,4 @@ def _check_vim_predict_method(method): "The method {} is not a valid method for variable importance measure prediction".format( method ) - ) - - -def sigmoid(x): - """ - This function applies the sigmoid function element-wise to the input array x - """ - return 1 / (1 + np.exp(-x)) - - -def softmax(x): - """ - This function applies the softmax function element-wise to the input array x - """ - # Ensure numerical stability by subtracting the maximum value of x from each element of x - # This prevents overflow errors when exponentiating large numbers - x = x - np.max(x, axis=-1, keepdims=True) - exp_x = np.exp(x) - return exp_x / np.sum(exp_x, axis=-1, keepdims=True) - - -def relu(x): - """ - This function applies the relu function element-wise to the input array x - """ - return (abs(x) + x) / 2 - - -def relu_(x): - """ - This function applies the derivative of the relu function element-wise - to the input array x - """ - return (x > 0) * 1 - - -def convert_predict_proba(list_probs): - """ - If the classification is done using a one-hot encoded variable, the list of - probabilites will be a list of lists for the probabilities of each of the categories. - This function takes the probabilities of having each category (=1 with binary) and stack - them into one ndarray. - """ - if len(list_probs.shape) == 3: - list_probs = np.array(list_probs)[..., 1].T - return list_probs - - -def ordinal_encode(y): - """ - This function encodes the ordinal variable with a special gradual encoding storing also - the natural order information. - """ - list_y = [] - for y_col in range(y.shape[-1]): - # Retrieve the unique values - unique_vals = np.unique(y[:, y_col]) - # Mapping each unique value to its corresponding index - mapping_dict = {} - for i, val in enumerate(unique_vals): - mapping_dict[val] = i + 1 - # create a zero-filled array for the ordinal encoding - y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col])))) - # set the appropriate indices to 1 for each ordinal value and all lower ordinal values - for ind_el, el in enumerate(y[:, y_col]): - y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1 - list_y.append(y_ordinal[:, 1:]) - - return list_y - - -def sample_predictions(predictions, random_state=None): - """ - This function samples from the same leaf node of the input sample - in both the regression and the classification cases - """ - rng = np.random.RandomState(random_state) - # print(predictions[..., rng.randint(predictions.shape[2]), :]) - # print(predictions.shape) - # exit(0) - return predictions[..., rng.randint(predictions.shape[2]), :] - - -def joblib_ensemble_dnnet( - X, - y, - problem_type="regression", - activation_outcome=None, - list_continuous=None, - list_grps=None, - sampling_with_repetition=False, - split_percentage=0.8, - group_stacking=False, - input_dimensions=None, - n_epoch=200, - batch_size=32, - beta1=0.9, - beta2=0.999, - lr=1e-3, - l1_weight=1e-2, - l2_weight=1e-2, - epsilon=1e-8, - random_state=None, -): - """ - This function implements the ensemble learning of the sub-DNN models - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_train_samples, n_features) - The input samples. - y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) - The target values (class labels in classification, real numbers in - regression). - problem_type : str, default='regression' - A classification or a regression problem. - activation_outcome : str, default=None - The activation function to apply in the outcome layer, "softmax" for - classification and "sigmoid" for both ordinal and binary cases. - list_continuous : list, default=None - The list of continuous variables. - list_grps : list of lists, default=None - A list collecting the indices of the groups' variables - while applying the stacking method. - sampling_with_repetition : bool, default=True - Application of sampling_with_repetition sampling for the training set. - split_percentage : float, default=0.8 - The training/validation cut for the provided data. - group_stacking : bool, default=False - Apply the stacking-based method for the provided groups. - input_dimensions : list, default=None - The cumsum of inputs after the linear sub-layers. - n_epoch : int, default=200 - The number of epochs for the DNN learner(s). - batch_size : int, default=32 - The number of samples per batch for training. - beta1 : float, default=0.9 - The exponential decay rate for the first moment estimates. - beta2 : float, default=0.999 - The exponential decay rate for the second moment estimates. - lr : float, default=1e-3 - The learning rate. - l1_weight : float, default=1e-2 - The L1-regularization paramter for weight decay. - l2_weight : float, default=0 - The L2-regularization paramter for weight decay. - epsilon : float, default=1e-8 - A small constant added to the denominator to prevent division by zero. - random_state : int, default=2023 - Fixing the seeds of the random generator. - - Returns - ------- - current_model : list - The parameters of the sub-DNN model - scaler_x : list of Scikit-learn StandardScalers - The scalers for the continuous input variables. - scaler_y : Scikit-learn StandardScaler - The scaler for the continuous output variable. - pred_v : ndarray - The predictions of the sub-DNN model. - loss : float - The loss score of the sub-DNN model. - """ - - pred_v = np.empty(X.shape[0]) - # Sampling and Train/Validate splitting - ( - X_train_scaled, - y_train_scaled, - X_validation_scaled, - y_validation_scaled, - X_scaled, - y_validation, - scaler_x, - scaler_y, - valid_ind, - ) = create_X_y( - X, - y, - sampling_with_repetition=sampling_with_repetition, - split_percentage=split_percentage, - problem_type=problem_type, - list_continuous=list_continuous, - random_state=random_state, - ) - - current_model = dnn_net( - X_train_scaled, - y_train_scaled, - X_validation_scaled, - y_validation_scaled, - problem_type=problem_type, - n_epoch=n_epoch, - batch_size=batch_size, - beta1=beta1, - beta2=beta2, - lr=lr, - l1_weight=l1_weight, - l2_weight=l2_weight, - epsilon=epsilon, - list_grps=list_grps, - group_stacking=group_stacking, - input_dimensions=input_dimensions, - random_state=random_state, - ) - - if not group_stacking: - X_scaled_n = X_scaled.copy() - else: - X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1])) - for grp_ind in range(len(list_grps)): - n_layer_stacking = len(current_model[3][grp_ind]) - 1 - curr_pred = X_scaled[:, list_grps[grp_ind]].copy() - for ind_w_b in range(n_layer_stacking): - if ind_w_b == 0: - curr_pred = relu( - X_scaled[:, list_grps[grp_ind]].dot( - current_model[3][grp_ind][ind_w_b] - ) - + current_model[4][grp_ind][ind_w_b] - ) - else: - curr_pred = relu( - curr_pred.dot(current_model[3][grp_ind][ind_w_b]) - + current_model[4][grp_ind][ind_w_b] - ) - X_scaled_n[ - :, - list( - np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1]) - ), - ] = ( - curr_pred.dot(current_model[3][grp_ind][n_layer_stacking]) - + current_model[4][grp_ind][n_layer_stacking] - ) - - n_layer = len(current_model[0]) - 1 - for j in range(n_layer): - if j == 0: - pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j]) - else: - pred = relu(pred.dot(current_model[0][j]) + current_model[1][j]) - - pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer] - - if problem_type not in ("classification", "binary"): - if problem_type != "ordinal": - pred_v = pred * scaler_y.scale_ + scaler_y.mean_ - else: - pred_v = activation_outcome[problem_type](pred) - loss = np.std(y_validation) ** 2 - mean_squared_error( - y_validation, pred_v[valid_ind] - ) - else: - pred_v = activation_outcome[problem_type](pred) - loss = log_loss( - y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0) - ) - log_loss(y_validation, pred_v[valid_ind]) - - return (current_model, scaler_x, scaler_y, pred_v, loss) - - -def initialize_weights(layer): - if isinstance(layer, nn.Linear): - layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2 - layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1 - - -def Dataset_Loader(X, y, shuffle=False, batch_size=50): - if y.shape[-1] == 2: - y = y[:, [1]] - dataset = torch.utils.data.TensorDataset( - torch.from_numpy(X).float(), torch.from_numpy(y).float() - ) - - loader = torch.utils.data.DataLoader( - dataset, batch_size=batch_size, shuffle=shuffle - ) - return loader - - -class DNN(nn.Module): - """ - Feedfoward Neural Network with 4 hidden layers - """ - - def __init__( - self, input_dim, group_stacking, list_grps, output_dimension, problem_type - ): - super().__init__() - if problem_type == "classification": - self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension) - else: - self.accuracy = Accuracy(task="binary") - self.list_grps = list_grps - self.group_stacking = group_stacking - if group_stacking: - self.layers_stacking = nn.ModuleList( - [ - nn.Linear( - in_features=len(grp), - out_features=input_dim[grp_ind + 1] - input_dim[grp_ind], - ) - # nn.Sequential( - # nn.Linear( - # in_features=len(grp), - # # out_features=max(1, int(0.1 * len(grp))), - # out_features=input_dim[grp_ind + 1] - # - input_dim[grp_ind], - # ), - # nn.ReLU(), - # nn.Linear( - # in_features=max(1, int(0.1 * len(grp))), - # out_features=input_dim[grp_ind + 1] - # - input_dim[grp_ind], - # ), - # nn.ReLU(), - # nn.Linear( - # in_features=max(1, int(0.1 * len(grp))), - # out_features=input_dim[grp_ind + 1] - # - input_dim[grp_ind], - # ), - # ) - for grp_ind, grp in enumerate(list_grps) - ] - ) - input_dim = input_dim[-1] - self.layers = nn.Sequential( - # hidden layers - nn.Linear(input_dim, 50), - nn.ReLU(), - nn.Linear(50, 40), - nn.ReLU(), - nn.Linear(40, 30), - nn.ReLU(), - nn.Linear(30, 20), - nn.ReLU(), - # output layer - nn.Linear(20, output_dimension), - ) - self.loss = 0 - - def forward(self, x): - if self.group_stacking: - list_stacking = [None] * len(self.layers_stacking) - for ind_layer, layer in enumerate(self.layers_stacking): - list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]]) - x = torch.cat(list_stacking, dim=1) - return self.layers(x) - - def training_step(self, batch, device, problem_type): - X, y = batch[0].to(device), batch[1].to(device) - y_pred = self(X) # Generate predictions - if problem_type == "regression": - loss = F.mse_loss(y_pred, y) - elif problem_type == "classification": - loss = F.cross_entropy(y_pred, y) # Calculate loss - else: - loss = F.binary_cross_entropy_with_logits(y_pred, y) - return loss - - def validation_step(self, batch, device, problem_type): - X, y = batch[0].to(device), batch[1].to(device) - y_pred = self(X) # Generate predictions - if problem_type == "regression": - loss = F.mse_loss(y_pred, y) - return { - "val_mse": loss, - "batch_size": len(X), - } - else: - if problem_type == "classification": - loss = F.cross_entropy(y_pred, y) # Calculate loss - else: - loss = F.binary_cross_entropy_with_logits(y_pred, y) - acc = self.accuracy(y_pred, y.int()) - return { - "val_loss": loss, - "val_acc": acc, - "batch_size": len(X), - } - - def validation_epoch_end(self, outputs, problem_type): - if problem_type in ("classification", "binary"): - batch_losses = [] - batch_accs = [] - batch_sizes = [] - for x in outputs: - batch_losses.append(x["val_loss"] * x["batch_size"]) - batch_accs.append(x["val_acc"] * x["batch_size"]) - batch_sizes.append(x["batch_size"]) - self.loss = torch.stack(batch_losses).sum().item() / np.sum( - batch_sizes - ) # Combine losses - epoch_acc = torch.stack(batch_accs).sum().item() / np.sum( - batch_sizes - ) # Combine accuracies - return {"val_loss": self.loss, "val_acc": epoch_acc} - else: - batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs] - batch_sizes = [x["batch_size"] for x in outputs] - self.loss = torch.stack(batch_losses).sum().item() / np.sum( - batch_sizes - ) # Combine losses - return {"val_mse": self.loss} - - def epoch_end(self, epoch, result): - if len(result) == 2: - print( - "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format( - epoch + 1, result["val_loss"], result["val_acc"] - ) - ) - else: - print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"])) - - -def evaluate(model, loader, device, problem_type): - outputs = [model.validation_step(batch, device, problem_type) for batch in loader] - return model.validation_epoch_end(outputs, problem_type) - - -def dnn_net( - X_train, - y_train, - X_validation, - y_validation, - problem_type="regression", - n_epoch=200, - batch_size=32, - batch_size_validation=128, - beta1=0.9, - beta2=0.999, - lr=1e-3, - l1_weight=1e-2, - l2_weight=1e-2, - epsilon=1e-8, - list_grps=None, - group_stacking=False, - input_dimensions=None, - random_state=2023, - verbose=0, -): - """ - This function implements the training/validation process of the sub-DNN - models - - Parameters - ---------- - X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features) - The training input samples. - y_train : {array-like} of shape (n_train_samples, ) - The training output samples. - X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) - The validation input samples. - y_validation : {array-like} of shape (n_validation_samples, ) - The validation output samples. - problem_type : str, default='regression' - A classification or a regression problem. - n_epoch : int, default=200 - The number of epochs for the DNN learner(s). - batch_size : int, default=32 - The number of samples per batch for training. - batch_size_validation : int, default=128 - The number of samples per batch for validation. - beta1 : float, default=0.9 - The exponential decay rate for the first moment estimates. - beta2 : float, default=0.999 - The exponential decay rate for the second moment estimates. - lr : float, default=1e-3 - The learning rate. - l1_weight : float, default=1e-2 - The L1-regularization paramter for weight decay. - l2_weight : float, default=0 - The L2-regularization paramter for weight decay. - epsilon : float, default=1e-8 - A small constant added to the denominator to prevent division by zero. - list_grps : list of lists, default=None - A list collecting the indices of the groups' variables - while applying the stacking method. - group_stacking : bool, default=False - Apply the stacking-based method for the provided groups. - input_dimensions : list, default=None - The cumsum of inputs after the linear sub-layers. - random_state : int, default=2023 - Fixing the seeds of the random generator. - verbose : int, default=0 - If verbose > 0, the fitted iterations will be printed. - """ - # Creating DataLoaders - train_loader = Dataset_Loader( - X_train, - y_train, - shuffle=True, - batch_size=batch_size, - ) - validate_loader = Dataset_Loader( - X_validation, y_validation, batch_size=batch_size_validation - ) - # Set the seed for PyTorch's random number generator - torch.manual_seed(random_state) - - # Set the seed for PyTorch's CUDA random number generator(s), if available - if torch.cuda.is_available(): - torch.cuda.manual_seed(random_state) - torch.cuda.manual_seed_all(random_state) - - # Specify whether to use GPU or CPU - # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - device = torch.device("cpu") - - if problem_type in ("regression", "binary"): - output_dimension = 1 - else: - output_dimension = y_train.shape[-1] - - # DNN model - input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1] - model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type) - model.to(device) - # Initializing weights/bias - model.apply(initialize_weights) - # Adam Optimizer - optimizer = torch.optim.Adam( - model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon - ) - - best_loss = 1e100 - for epoch in range(n_epoch): - # Training Phase - model.train() - for batch in train_loader: - optimizer.zero_grad() - loss = model.training_step(batch, device, problem_type) - - loss.backward() - optimizer.step() - for name, param in model.named_parameters(): - if "bias" not in name: - # if name.split(".")[0] == "layers_stacking": - # param.data -= l2_weight * param.data - # else: - param.data -= ( - l1_weight * torch.sign(param.data) + l2_weight * param.data - ) - # Validation Phase - model.eval() - result = evaluate(model, validate_loader, device, problem_type) - if model.loss < best_loss: - best_loss = model.loss - dict_params = copy.deepcopy(model.state_dict()) - if verbose >= 2: - model.epoch_end(epoch, result) - - best_weight = [] - best_bias = [] - best_weight_stack = [[].copy() for _ in range(len(list_grps))] - best_bias_stack = [[].copy() for _ in range(len(list_grps))] - - for name, param in dict_params.items(): - if name.split(".")[0] == "layers": - if name.split(".")[-1] == "weight": - best_weight.append(param.numpy().T) - if name.split(".")[-1] == "bias": - best_bias.append(param.numpy()[np.newaxis, :]) - if name.split(".")[0] == "layers_stacking": - curr_ind = int(name.split(".")[1]) - if name.split(".")[-1] == "weight": - best_weight_stack[curr_ind].append(param.numpy().T) - if name.split(".")[-1] == "bias": - best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :]) - - return [ - best_weight, - best_bias, - best_loss, - best_weight_stack, - best_bias_stack, - ] - - -def compute_imp_std(pred_scores): - weights = np.array([el.shape[-2] for el in pred_scores]) - # Compute the mean of each fold over the number of observations - pred_mean = np.array([np.mean(el.copy(), axis=-2) for el in pred_scores]) - - # Weighted average - imp = np.average(pred_mean, axis=0, weights=weights) - - # Compute the standard deviation of each fold - # over the number of observations - pred_std = np.array( - [ - np.mean( - (el - imp[..., np.newaxis]) ** 2, - axis=-2, - ) - for el in pred_scores - ] - ) - std = np.sqrt(np.average(pred_std, axis=0, weights=weights) / (np.sum(weights) - 1)) - return (imp, std) + ) \ No newline at end of file From 1e546e01207591764c519cb74d4e872c169c0d55 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 14:31:50 +0100 Subject: [PATCH 03/19] Modifier Random forest for using their last function --- .../test/test_RandomForestModified.py | 102 ++++++++++++++++++ .../estimator/RandomForestModified.py | 28 ++--- 2 files changed, 117 insertions(+), 13 deletions(-) create mode 100644 hidimstat/estimator/test/test_RandomForestModified.py diff --git a/hidimstat/estimator/test/test_RandomForestModified.py b/hidimstat/estimator/test/test_RandomForestModified.py new file mode 100644 index 000000000..e625316e3 --- /dev/null +++ b/hidimstat/estimator/test/test_RandomForestModified.py @@ -0,0 +1,102 @@ +from hidimstat.estimator.test._utils_test import generate_data +from hidimstat.estimator.RandomForestModified import RandomForestClassifierModified, RandomForestRegressorModified +import numpy as np + + +def test_RandomForestRegressorModified(): + """ + Test the RandomForestRegressorModified for regression. + Parameters: + - regression_data: A tuple containing the input features (X) and target variable (y) for regression. + """ + X, y = generate_data(problem_type="regression") + learner = RandomForestRegressorModified(n_jobs=10, verbose=0) + learner.fit(X, y) + predict = learner.predict(X) + # Check if the predicted values are close to the true values for at least one instance + assert np.max(np.abs(predict-y)) < 200.0 + # Check if the predicted values are close to the true values for at least one instance + assert np.all(predict == y) or np.any(predict != y) + # Check if the predicted values are not all the same + assert not np.all(predict == predict[0]) + # Check if the predicted values are not all zeros + assert not np.all(predict == 0) + # Check if the predicted values are not all ones + assert not np.all(predict == 1) + # Check if the feature importances are not all zeros + assert not np.all(learner.feature_importances_ == 0) + # Check if the feature importances are not all the same + assert not np.all(learner.feature_importances_ == learner.feature_importances_[0]) + # Check if the feature importances are not all ones + assert not np.all(learner.feature_importances_ == 1) + # Check if the feature importances are not all negative + assert not np.all(learner.feature_importances_ < 0) + # # Check if the feature importances are not all positive + # assert not np.all(learner.feature_importances_ > 0) + # Check if the feature importances are not all close to zero + assert not np.allclose(learner.feature_importances_, 0) + # Check if the feature importances are not all close to one + assert not np.allclose(learner.feature_importances_, 1) + + predictions = learner.sample_same_leaf(X) + #TODO: add more tests for sample_same_leaf + +def test_RandomForestClassifierModified(): + """ + Test the RandomForestClassifierModified for classification. + """ + X, y = generate_data(problem_type="classification") + learner = RandomForestClassifierModified(n_jobs=10, verbose=0) + learner.fit(X, y) + predict_prob = learner.predict_proba(X) + # Check if the predicted probabilities sum up to 1 for each instance + assert np.allclose(np.sum(predict_prob, axis=1), 1) + # Check if the predicted class labels match the true labels for at least one instance + assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0 + assert np.all(np.max(predict_prob, axis=1) >= 0.5) + assert np.all(np.min(predict_prob, axis=1) < 0.5) + # Check if the maximum predicted probability is greater than 0.95 + assert 0.95 < np.max(predict_prob) + # Check if the minimum predicted probability is less than 0.05 + assert 0.05 > np.min(predict_prob) + # Check if the predicted probabilities are not all the same + assert not np.all(predict_prob == predict_prob[0]) + # Check if the predicted probabilities are not all zeros + assert not np.all(predict_prob == 0) + # Check if the predicted probabilities are not all ones + assert not np.all(predict_prob == 1) + # Check if the predicted probabilities are not all the same for each class + assert not np.all(predict_prob[:, 0] == predict_prob[0, 0]) + assert not np.all(predict_prob[:, 1] == predict_prob[0, 1]) + # Check if the predicted probabilities are not all zeros for each class + assert not np.all(predict_prob[:, 0] == 0) + assert not np.all(predict_prob[:, 1] == 0) + # Check if the predicted probabilities are not all ones for each class + assert not np.all(predict_prob[:, 0] == 1) + + predict = learner.predict(X) + # Check if the predicted values are close to the true values for at least one instance + assert np.all(predict == y) or np.any(predict != y) + # Check if the predicted values are not all the same + assert not np.all(predict == predict[0]) + # Check if the predicted values are not all zeros + assert not np.all(predict == 0) + # Check if the predicted values are not all ones + assert not np.all(predict == 1) + # Check if the feature importances are not all zeros + assert not np.all(learner.feature_importances_ == 0) + # Check if the feature importances are not all the same + assert not np.all(learner.feature_importances_ == learner.feature_importances_[0]) + # Check if the feature importances are not all ones + assert not np.all(learner.feature_importances_ == 1) + # Check if the feature importances are not all negative + assert not np.all(learner.feature_importances_ < 0) + # # Check if the feature importances are not all positive + # assert not np.all(learner.feature_importances_ > 0) + # Check if the feature importances are not all close to zero + assert not np.allclose(learner.feature_importances_, 0) + # Check if the feature importances are not all close to one + assert not np.allclose(learner.feature_importances_, 1) + + predictions = learner.sample_same_leaf(X) + #TODO: add more tests for sample_same_leaf \ No newline at end of file diff --git a/src/hidimstat/estimator/RandomForestModified.py b/src/hidimstat/estimator/RandomForestModified.py index 7dd5b12a1..097e3d473 100644 --- a/src/hidimstat/estimator/RandomForestModified.py +++ b/src/hidimstat/estimator/RandomForestModified.py @@ -5,13 +5,13 @@ class RandomForestClassifierModified(RandomForestClassifier): def fit(self, X, y): self.y_ = y - super().fit(X, y) + return super().fit(X, y) def predict(self, X): - super().predict(X) + return super().predict(X) def predict_proba(self, X): - super().predict_proba(X) + return super().predict_proba(X) def sample_same_leaf(self, X, y=None): if not (y is None): @@ -42,23 +42,24 @@ def sample_same_leaf(self, X, y=None): )[0] # Append the samples to the list - leaf_samples.append( - y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] - ) + if samples_in_leaf.size > 0: + leaf_samples.append( + y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] + ) predictions.append(leaf_samples) # Combine the predictions from all trees to make the final prediction - return np.array(predictions) + return np.array(predictions, dtype=object) class RandomForestRegressorModified(RandomForestRegressor): def fit(self, X, y): self.y_ = y - super().fit(X, y) + return super().fit(X, y) def predict(self, X): - super().predict(X) + return super().predict(X) def sample_same_leaf(self, X): rng = np.random.RandomState(self.get_params()["random_state"]) @@ -87,11 +88,12 @@ def sample_same_leaf(self, X): )[0] # Append the samples to the list - leaf_samples.append( - y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] - ) + if samples_in_leaf.size > 0: + leaf_samples.append( + y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] + ) predictions.append(leaf_samples) # Combine the predictions from all trees to make the final prediction - return np.array(predictions) + return np.array(predictions, dtype=object) From c72c6fa9e330f1ea64289ac96655f4a5da6a4c71 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 14:33:28 +0100 Subject: [PATCH 04/19] Add test for DNN learner --- hidimstat/estimator/test/_utils_test.py | 54 ++++++++++++++++++ hidimstat/estimator/test/test_Dnn_learner.py | 58 ++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 hidimstat/estimator/test/_utils_test.py create mode 100644 hidimstat/estimator/test/test_Dnn_learner.py diff --git a/hidimstat/estimator/test/_utils_test.py b/hidimstat/estimator/test/_utils_test.py new file mode 100644 index 000000000..a29e849ff --- /dev/null +++ b/hidimstat/estimator/test/_utils_test.py @@ -0,0 +1,54 @@ +from sklearn.datasets import make_classification, make_regression +import numpy as np +import pandas as pd +import pytest + + +def generate_data( + n_samples=200, + n_features=10, + problem_type="regression", + seed=2024, +): + """ + This function generates the synthetic data used in the different tests. + ---------- + n_samples : int, optional + Number of samples to generate, by default 200 + n_features : int, optional + Number of features to generate, by default 10 + problem_type : str, optional + Type of problem to generate, by default "regression" (options: "regression", "classification") + seed : int, optional + Random seed, by default 2024 + ---------- + Returns + ------- + X : pd.DataFrame + Data matrix + y : np.array + Target vector + grps : np.array + Group vector + """ + rng = np.random.default_rng(seed) + if problem_type == "regression": + X, y = make_regression( + n_samples=n_samples, + noise=0.2, + n_features=n_features, + random_state=seed, + ) + else: + X, y = make_classification( + n_samples=n_samples, + n_classes=2, + n_informative=5, + n_features=n_features, + random_state=seed, + ) + #y = np.array([str(i) for i in y]) + + X = pd.DataFrame(X, columns=[f"col{i+1}" for i in range(n_features)]) + + return X, y \ No newline at end of file diff --git a/hidimstat/estimator/test/test_Dnn_learner.py b/hidimstat/estimator/test/test_Dnn_learner.py new file mode 100644 index 000000000..8f131bde5 --- /dev/null +++ b/hidimstat/estimator/test/test_Dnn_learner.py @@ -0,0 +1,58 @@ +from hidimstat.estimator.test._utils_test import generate_data +from hidimstat.estimator.Dnn_learner import DnnLearner +import numpy as np + + +def test_DNN_regression(): + """ + Test the DNN learner for regression. + """ + X, y = generate_data(problem_type="regression") + learner = DnnLearner(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0) + learner.fit(X, np.expand_dims(y, axis=1)) + predict = learner.predict(X)[0, :, 0] + # Check if the predicted values are close to the true values for at least one instance + assert np.max(np.abs(predict-y)) < 4.0 + # Check if the predicted values are close to the true values for at least one instance + assert np.all(predict == y) or np.any(predict != y) + # Check if the predicted values are not all the same + assert not np.all(predict == predict[0]) + # Check if the predicted values are not all zeros + assert not np.all(predict == 0) + # Check if the predicted values are not all ones + assert not np.all(predict == 1) + + +def test_DNN_classification(): + """ + Test the DNN learner for classification. + """ + X, y = generate_data(problem_type="classification") + learner = DnnLearner(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0) + learner.fit(X, np.expand_dims(y, axis=1)) + predict_prob = learner.predict_proba(X) + # Check if the predicted probabilities sum up to 1 for each instance + assert np.allclose(np.sum(predict_prob, axis=1), 1) + # Check if the predicted class labels match the true labels for at least one instance + assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0 + assert np.all(np.max(predict_prob, axis=1) >= 0.5) + assert np.all(np.min(predict_prob, axis=1) < 0.5) + # Check if the maximum predicted probability is greater than 0.95 + assert 0.95 < np.max(predict_prob) + # Check if the minimum predicted probability is less than 0.05 + assert 0.05 > np.min(predict_prob) + # Check if the predicted probabilities are not all the same + assert not np.all(predict_prob == predict_prob[0]) + # Check if the predicted probabilities are not all zeros + assert not np.all(predict_prob == 0) + # Check if the predicted probabilities are not all ones + assert not np.all(predict_prob == 1) + # Check if the predicted probabilities are not all the same for each class + assert not np.all(predict_prob[:, 0] == predict_prob[0, 0]) + assert not np.all(predict_prob[:, 1] == predict_prob[0, 1]) + # Check if the predicted probabilities are not all zeros for each class + assert not np.all(predict_prob[:, 0] == 0) + assert not np.all(predict_prob[:, 1] == 0) + # Check if the predicted probabilities are not all ones for each class + assert not np.all(predict_prob[:, 0] == 1) + From 5bbbc036c48936ea59871359ac468dac9961b74a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 14:33:58 +0100 Subject: [PATCH 05/19] Add test for Dnn_learner_single and fix some bugs --- .../estimator/test/test_Dnn_learner_single.py | 103 ++++++++++++++++++ src/hidimstat/estimator/Dnn_learner_single.py | 14 +-- 2 files changed, 109 insertions(+), 8 deletions(-) create mode 100644 hidimstat/estimator/test/test_Dnn_learner_single.py diff --git a/hidimstat/estimator/test/test_Dnn_learner_single.py b/hidimstat/estimator/test/test_Dnn_learner_single.py new file mode 100644 index 000000000..4f28b51de --- /dev/null +++ b/hidimstat/estimator/test/test_Dnn_learner_single.py @@ -0,0 +1,103 @@ +from hidimstat.estimator.test._utils_test import generate_data +from hidimstat.estimator.Dnn_learner_single import DnnLearnerSingle +import numpy as np + + +def test_DNN_single_regression(): + """ + Test the DNN learner single for regression. + """ + X, y = generate_data(problem_type="regression") + learner = DnnLearnerSingle(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0) + learner.fit(X, y) + predict = learner.predict(X)[:,0] + # Check if the predicted values are close to the true values for at least one instance + assert np.max(np.abs(predict-y)) < 4.0 + # Check if the predicted values are close to the true values for at least one instance + assert np.all(predict == y) or np.any(predict != y) + # Check if the predicted values are not all the same + assert not np.all(predict == predict[0]) + # Check if the predicted values are not all zeros + assert not np.all(predict == 0) + # Check if the predicted values are not all ones + assert not np.all(predict == 1) + + +def test_DNN_single_classification(): + """ + Test the DNN learner single for classification. + """ + X, y = generate_data(problem_type="classification") + learner = DnnLearnerSingle(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0) + learner.fit(X, y) + predict_prob = learner.predict_proba(X) + # Check if the predicted probabilities sum up to 1 for each instance + assert np.allclose(np.sum(predict_prob, axis=1), 1) + # Check if the predicted class labels match the true labels for at least one instance + assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0 + assert np.all(np.max(predict_prob, axis=1) >= 0.5) + assert np.all(np.min(predict_prob, axis=1) < 0.5) + # Check if the maximum predicted probability is greater than 0.95 + assert 0.95 < np.max(predict_prob) + # Check if the minimum predicted probability is less than 0.05 + assert 0.05 > np.min(predict_prob) + # Check if the predicted probabilities are not all the same + assert not np.all(predict_prob == predict_prob[0]) + # Check if the predicted probabilities are not all zeros + assert not np.all(predict_prob == 0) + # Check if the predicted probabilities are not all ones + assert not np.all(predict_prob == 1) + # Check if the predicted probabilities are not all the same for each class + assert not np.all(predict_prob[:, 0] == predict_prob[0, 0]) + assert not np.all(predict_prob[:, 1] == predict_prob[0, 1]) + # Check if the predicted probabilities are not all zeros for each class + assert not np.all(predict_prob[:, 0] == 0) + assert not np.all(predict_prob[:, 1] == 0) + # Check if the predicted probabilities are not all ones for each class + assert not np.all(predict_prob[:, 0] == 1) + + +def test_DNN_single_binary(): + """ + Test the DNN learner single for binary classification. + """ + X, y = generate_data(problem_type="classification") + learner = DnnLearnerSingle(do_hypertuning=True, problem_type="binary", n_jobs=10, verbose=0) + learner.fit(X, y) + predict_prob = learner.predict_proba(X) + # Check if the predicted probabilities sum up to 1 for each instance + assert np.allclose(np.sum(predict_prob, axis=1), 1) + # Check if the predicted class labels match the true labels for at least one instance + assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0 + assert np.all(np.max(predict_prob, axis=1) >= 0.5) + assert np.all(np.min(predict_prob, axis=1) < 0.5) + # Check if the maximum predicted probability is greater than 0.95 + assert 0.95 < np.max(predict_prob) + # Check if the minimum predicted probability is less than 0.05 + assert 0.05 > np.min(predict_prob) + # Check if the predicted probabilities are not all the same + assert not np.all(predict_prob == predict_prob[0]) + # Check if the predicted probabilities are not all zeros + assert not np.all(predict_prob == 0) + # Check if the predicted probabilities are not all ones + assert not np.all(predict_prob == 1) + # Check if the predicted probabilities are not all the same for each class + assert not np.all(predict_prob[:, 0] == predict_prob[0, 0]) + assert not np.all(predict_prob[:, 1] == predict_prob[0, 1]) + # Check if the predicted probabilities are not all zeros for each class + assert not np.all(predict_prob[:, 0] == 0) + assert not np.all(predict_prob[:, 1] == 0) + # Check if the predicted probabilities are not all ones for each class + assert not np.all(predict_prob[:, 0] == 1) + + +def test_DNN_single_ordinal(): + """ + Test the DNN learner single for ordinal. + """ + X, y = generate_data(problem_type="classification") + learner = DnnLearnerSingle(do_hypertuning=True, problem_type="ordinal", n_jobs=10, verbose=0) + learner.fit(X, y) + predict_prob = learner.predict_proba(X)[:,0] + # Check if the predicted class labels match the true labels for at least one instance + #assert np.sum(np.abs((np.around(predict_prob)[:,0]-y))) == 0 \ No newline at end of file diff --git a/src/hidimstat/estimator/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py index ddffe5bb8..7b2ee8a45 100644 --- a/src/hidimstat/estimator/Dnn_learner_single.py +++ b/src/hidimstat/estimator/Dnn_learner_single.py @@ -8,7 +8,7 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.utils.validation import check_is_fitted -from .utils import ( +from ._utils.u_Dnn_learner import ( create_X_y, dnn_net, joblib_ensemble_dnnet, @@ -241,6 +241,7 @@ def fit(self, X, y=None): loss = np.array(res_ens[4]) if self.n_ensemble == 1: + raise Warning("The model can't be fit with n_ensemble = 1") return [(res_ens[0][0], (res_ens[1][0], res_ens[2][0]))] # Keeping the optimal subset of DNNs @@ -283,6 +284,9 @@ def encode_outcome(self, y, train=True): y = y.reshape(-1, 1) if self.problem_type == "regression": list_y.append(y) + # Encoding the target with the ordinal case + if self.problem_type == "ordinal": + list_y = ordinal_encode(y) for col in range(y.shape[1]): if train: @@ -291,18 +295,12 @@ def encode_outcome(self, y, train=True): self.enc_y.append(OneHotEncoder(handle_unknown="ignore")) curr_y = self.enc_y[col].fit_transform(y[:, [col]]).toarray() list_y.append(curr_y) - - # Encoding the target with the ordinal case - if self.problem_type == "ordinal": - y = ordinal_encode(y) - else: # Encoding the target with the classification case if self.problem_type in ("classification", "binary"): curr_y = self.enc_y[col].transform(y[:, [col]]).toarray() list_y.append(curr_y) - - ## ToDo Add the ordinal case + return np.array(list_y) def hyper_tuning( From 8a806f4e8a52ca19cadacb905f85dc68bc6c92e5 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 14:35:46 +0100 Subject: [PATCH 06/19] Ignore coverage files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a13abe82b..425d3aba2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ joblib *.pyc __pycache__ *.egg-info -.coverage +.coverage* # documenation doc From 8fc0e8220049e501404b3f2e809bf1eff6ebe621 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 14:59:44 +0100 Subject: [PATCH 07/19] Missing a file for test --- hidimstat/estimator/test/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 hidimstat/estimator/test/__init__.py diff --git a/hidimstat/estimator/test/__init__.py b/hidimstat/estimator/test/__init__.py new file mode 100644 index 000000000..e69de29bb From d91f78ba03734e85eb7ac057a1998376beb8fb15 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 13 Dec 2024 16:25:36 +0100 Subject: [PATCH 08/19] Modified coverage configuration for new tests --- hidimstat/estimator/{test => tests}/__init__.py | 0 hidimstat/estimator/{test => tests}/_utils_test.py | 0 hidimstat/estimator/{test => tests}/test_Dnn_learner.py | 0 hidimstat/estimator/{test => tests}/test_Dnn_learner_single.py | 0 hidimstat/estimator/{test => tests}/test_RandomForestModified.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename hidimstat/estimator/{test => tests}/__init__.py (100%) rename hidimstat/estimator/{test => tests}/_utils_test.py (100%) rename hidimstat/estimator/{test => tests}/test_Dnn_learner.py (100%) rename hidimstat/estimator/{test => tests}/test_Dnn_learner_single.py (100%) rename hidimstat/estimator/{test => tests}/test_RandomForestModified.py (100%) diff --git a/hidimstat/estimator/test/__init__.py b/hidimstat/estimator/tests/__init__.py similarity index 100% rename from hidimstat/estimator/test/__init__.py rename to hidimstat/estimator/tests/__init__.py diff --git a/hidimstat/estimator/test/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py similarity index 100% rename from hidimstat/estimator/test/_utils_test.py rename to hidimstat/estimator/tests/_utils_test.py diff --git a/hidimstat/estimator/test/test_Dnn_learner.py b/hidimstat/estimator/tests/test_Dnn_learner.py similarity index 100% rename from hidimstat/estimator/test/test_Dnn_learner.py rename to hidimstat/estimator/tests/test_Dnn_learner.py diff --git a/hidimstat/estimator/test/test_Dnn_learner_single.py b/hidimstat/estimator/tests/test_Dnn_learner_single.py similarity index 100% rename from hidimstat/estimator/test/test_Dnn_learner_single.py rename to hidimstat/estimator/tests/test_Dnn_learner_single.py diff --git a/hidimstat/estimator/test/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py similarity index 100% rename from hidimstat/estimator/test/test_RandomForestModified.py rename to hidimstat/estimator/tests/test_RandomForestModified.py From d032aa25966930f8ac146719492b153eda7e212b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 10:37:35 +0100 Subject: [PATCH 09/19] Fix bug in the import of test --- hidimstat/estimator/tests/test_Dnn_learner.py | 2 +- hidimstat/estimator/tests/test_Dnn_learner_single.py | 2 +- hidimstat/estimator/tests/test_RandomForestModified.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hidimstat/estimator/tests/test_Dnn_learner.py b/hidimstat/estimator/tests/test_Dnn_learner.py index 8f131bde5..e8dad389f 100644 --- a/hidimstat/estimator/tests/test_Dnn_learner.py +++ b/hidimstat/estimator/tests/test_Dnn_learner.py @@ -1,4 +1,4 @@ -from hidimstat.estimator.test._utils_test import generate_data +from hidimstat.estimator.tests._utils_test import generate_data from hidimstat.estimator.Dnn_learner import DnnLearner import numpy as np diff --git a/hidimstat/estimator/tests/test_Dnn_learner_single.py b/hidimstat/estimator/tests/test_Dnn_learner_single.py index 4f28b51de..42037bfd2 100644 --- a/hidimstat/estimator/tests/test_Dnn_learner_single.py +++ b/hidimstat/estimator/tests/test_Dnn_learner_single.py @@ -1,4 +1,4 @@ -from hidimstat.estimator.test._utils_test import generate_data +from hidimstat.estimator.tests._utils_test import generate_data from hidimstat.estimator.Dnn_learner_single import DnnLearnerSingle import numpy as np diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py index e625316e3..37bdcf8bf 100644 --- a/hidimstat/estimator/tests/test_RandomForestModified.py +++ b/hidimstat/estimator/tests/test_RandomForestModified.py @@ -1,4 +1,4 @@ -from hidimstat.estimator.test._utils_test import generate_data +from hidimstat.estimator.tests._utils_test import generate_data from hidimstat.estimator.RandomForestModified import RandomForestClassifierModified, RandomForestRegressorModified import numpy as np From b1638a3ca6892d1e2fac792e9af85ed877dfa84b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 19 Dec 2024 17:27:49 +0100 Subject: [PATCH 10/19] Fix some typos --- hidimstat/estimator/tests/_utils_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hidimstat/estimator/tests/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py index a29e849ff..7d50cea92 100644 --- a/hidimstat/estimator/tests/_utils_test.py +++ b/hidimstat/estimator/tests/_utils_test.py @@ -1,7 +1,6 @@ from sklearn.datasets import make_classification, make_regression import numpy as np import pandas as pd -import pytest def generate_data( @@ -12,6 +11,8 @@ def generate_data( ): """ This function generates the synthetic data used in the different tests. + + Parameters ---------- n_samples : int, optional Number of samples to generate, by default 200 @@ -21,7 +22,7 @@ def generate_data( Type of problem to generate, by default "regression" (options: "regression", "classification") seed : int, optional Random seed, by default 2024 - ---------- + Returns ------- X : pd.DataFrame From e0075fcfbe2cfa58536689cd0f61c9e9c2b0177c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 19 Dec 2024 17:30:17 +0100 Subject: [PATCH 11/19] Formating files --- hidimstat/estimator/_utils/u_Dnn_learner.py | 4 +-- hidimstat/estimator/tests/_utils_test.py | 6 ++--- hidimstat/estimator/tests/test_Dnn_learner.py | 11 +++++--- .../tests/test_Dnn_learner_single.py | 26 ++++++++++++------- .../tests/test_RandomForestModified.py | 12 ++++++--- src/hidimstat/estimator/Dnn_learner_single.py | 2 +- 6 files changed, 37 insertions(+), 24 deletions(-) diff --git a/hidimstat/estimator/_utils/u_Dnn_learner.py b/hidimstat/estimator/_utils/u_Dnn_learner.py index 150e3f262..7008b0313 100644 --- a/hidimstat/estimator/_utils/u_Dnn_learner.py +++ b/hidimstat/estimator/_utils/u_Dnn_learner.py @@ -1,4 +1,3 @@ - import numpy as np import copy import torch @@ -9,7 +8,6 @@ from sklearn.preprocessing import StandardScaler - def create_X_y( X, y, @@ -654,4 +652,4 @@ def dnn_net( best_loss, best_weight_stack, best_bias_stack, - ] \ No newline at end of file + ] diff --git a/hidimstat/estimator/tests/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py index 7d50cea92..bef030fde 100644 --- a/hidimstat/estimator/tests/_utils_test.py +++ b/hidimstat/estimator/tests/_utils_test.py @@ -11,7 +11,7 @@ def generate_data( ): """ This function generates the synthetic data used in the different tests. - + Parameters ---------- n_samples : int, optional @@ -48,8 +48,8 @@ def generate_data( n_features=n_features, random_state=seed, ) - #y = np.array([str(i) for i in y]) + # y = np.array([str(i) for i in y]) X = pd.DataFrame(X, columns=[f"col{i+1}" for i in range(n_features)]) - return X, y \ No newline at end of file + return X, y diff --git a/hidimstat/estimator/tests/test_Dnn_learner.py b/hidimstat/estimator/tests/test_Dnn_learner.py index e8dad389f..1da5b4da5 100644 --- a/hidimstat/estimator/tests/test_Dnn_learner.py +++ b/hidimstat/estimator/tests/test_Dnn_learner.py @@ -8,11 +8,13 @@ def test_DNN_regression(): Test the DNN learner for regression. """ X, y = generate_data(problem_type="regression") - learner = DnnLearner(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0) + learner = DnnLearner( + do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0 + ) learner.fit(X, np.expand_dims(y, axis=1)) predict = learner.predict(X)[0, :, 0] # Check if the predicted values are close to the true values for at least one instance - assert np.max(np.abs(predict-y)) < 4.0 + assert np.max(np.abs(predict - y)) < 4.0 # Check if the predicted values are close to the true values for at least one instance assert np.all(predict == y) or np.any(predict != y) # Check if the predicted values are not all the same @@ -28,7 +30,9 @@ def test_DNN_classification(): Test the DNN learner for classification. """ X, y = generate_data(problem_type="classification") - learner = DnnLearner(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0) + learner = DnnLearner( + do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0 + ) learner.fit(X, np.expand_dims(y, axis=1)) predict_prob = learner.predict_proba(X) # Check if the predicted probabilities sum up to 1 for each instance @@ -55,4 +59,3 @@ def test_DNN_classification(): assert not np.all(predict_prob[:, 1] == 0) # Check if the predicted probabilities are not all ones for each class assert not np.all(predict_prob[:, 0] == 1) - diff --git a/hidimstat/estimator/tests/test_Dnn_learner_single.py b/hidimstat/estimator/tests/test_Dnn_learner_single.py index 42037bfd2..4c4f5fc4d 100644 --- a/hidimstat/estimator/tests/test_Dnn_learner_single.py +++ b/hidimstat/estimator/tests/test_Dnn_learner_single.py @@ -8,11 +8,13 @@ def test_DNN_single_regression(): Test the DNN learner single for regression. """ X, y = generate_data(problem_type="regression") - learner = DnnLearnerSingle(do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0) + learner = DnnLearnerSingle( + do_hypertuning=True, problem_type="regression", n_jobs=10, verbose=0 + ) learner.fit(X, y) - predict = learner.predict(X)[:,0] + predict = learner.predict(X)[:, 0] # Check if the predicted values are close to the true values for at least one instance - assert np.max(np.abs(predict-y)) < 4.0 + assert np.max(np.abs(predict - y)) < 4.0 # Check if the predicted values are close to the true values for at least one instance assert np.all(predict == y) or np.any(predict != y) # Check if the predicted values are not all the same @@ -28,7 +30,9 @@ def test_DNN_single_classification(): Test the DNN learner single for classification. """ X, y = generate_data(problem_type="classification") - learner = DnnLearnerSingle(do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0) + learner = DnnLearnerSingle( + do_hypertuning=True, problem_type="classification", n_jobs=10, verbose=0 + ) learner.fit(X, y) predict_prob = learner.predict_proba(X) # Check if the predicted probabilities sum up to 1 for each instance @@ -55,14 +59,16 @@ def test_DNN_single_classification(): assert not np.all(predict_prob[:, 1] == 0) # Check if the predicted probabilities are not all ones for each class assert not np.all(predict_prob[:, 0] == 1) - + def test_DNN_single_binary(): """ Test the DNN learner single for binary classification. """ X, y = generate_data(problem_type="classification") - learner = DnnLearnerSingle(do_hypertuning=True, problem_type="binary", n_jobs=10, verbose=0) + learner = DnnLearnerSingle( + do_hypertuning=True, problem_type="binary", n_jobs=10, verbose=0 + ) learner.fit(X, y) predict_prob = learner.predict_proba(X) # Check if the predicted probabilities sum up to 1 for each instance @@ -96,8 +102,10 @@ def test_DNN_single_ordinal(): Test the DNN learner single for ordinal. """ X, y = generate_data(problem_type="classification") - learner = DnnLearnerSingle(do_hypertuning=True, problem_type="ordinal", n_jobs=10, verbose=0) + learner = DnnLearnerSingle( + do_hypertuning=True, problem_type="ordinal", n_jobs=10, verbose=0 + ) learner.fit(X, y) - predict_prob = learner.predict_proba(X)[:,0] + predict_prob = learner.predict_proba(X)[:, 0] # Check if the predicted class labels match the true labels for at least one instance - #assert np.sum(np.abs((np.around(predict_prob)[:,0]-y))) == 0 \ No newline at end of file + # assert np.sum(np.abs((np.around(predict_prob)[:,0]-y))) == 0 diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py index 37bdcf8bf..a7452c706 100644 --- a/hidimstat/estimator/tests/test_RandomForestModified.py +++ b/hidimstat/estimator/tests/test_RandomForestModified.py @@ -1,5 +1,8 @@ from hidimstat.estimator.tests._utils_test import generate_data -from hidimstat.estimator.RandomForestModified import RandomForestClassifierModified, RandomForestRegressorModified +from hidimstat.estimator.RandomForestModified import ( + RandomForestClassifierModified, + RandomForestRegressorModified, +) import numpy as np @@ -14,7 +17,7 @@ def test_RandomForestRegressorModified(): learner.fit(X, y) predict = learner.predict(X) # Check if the predicted values are close to the true values for at least one instance - assert np.max(np.abs(predict-y)) < 200.0 + assert np.max(np.abs(predict - y)) < 200.0 # Check if the predicted values are close to the true values for at least one instance assert np.all(predict == y) or np.any(predict != y) # Check if the predicted values are not all the same @@ -39,7 +42,8 @@ def test_RandomForestRegressorModified(): assert not np.allclose(learner.feature_importances_, 1) predictions = learner.sample_same_leaf(X) - #TODO: add more tests for sample_same_leaf + # TODO: add more tests for sample_same_leaf + def test_RandomForestClassifierModified(): """ @@ -99,4 +103,4 @@ def test_RandomForestClassifierModified(): assert not np.allclose(learner.feature_importances_, 1) predictions = learner.sample_same_leaf(X) - #TODO: add more tests for sample_same_leaf \ No newline at end of file + # TODO: add more tests for sample_same_leaf diff --git a/src/hidimstat/estimator/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py index 7b2ee8a45..9f487771b 100644 --- a/src/hidimstat/estimator/Dnn_learner_single.py +++ b/src/hidimstat/estimator/Dnn_learner_single.py @@ -300,7 +300,7 @@ def encode_outcome(self, y, train=True): if self.problem_type in ("classification", "binary"): curr_y = self.enc_y[col].transform(y[:, [col]]).toarray() list_y.append(curr_y) - + return np.array(list_y) def hyper_tuning( From 6217732c6b2a5fc8825b41fc08ff27e20c190089 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 19 Dec 2024 17:31:32 +0100 Subject: [PATCH 12/19] Change name of the files --- hidimstat/estimator/_utils/{u_Dnn_learner.py => Dnn.py} | 0 src/hidimstat/estimator/Dnn_learner_single.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename hidimstat/estimator/_utils/{u_Dnn_learner.py => Dnn.py} (100%) diff --git a/hidimstat/estimator/_utils/u_Dnn_learner.py b/hidimstat/estimator/_utils/Dnn.py similarity index 100% rename from hidimstat/estimator/_utils/u_Dnn_learner.py rename to hidimstat/estimator/_utils/Dnn.py diff --git a/src/hidimstat/estimator/Dnn_learner_single.py b/src/hidimstat/estimator/Dnn_learner_single.py index 9f487771b..6ffea2c12 100644 --- a/src/hidimstat/estimator/Dnn_learner_single.py +++ b/src/hidimstat/estimator/Dnn_learner_single.py @@ -8,7 +8,7 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.utils.validation import check_is_fitted -from ._utils.u_Dnn_learner import ( +from ._utils.Dnn import ( create_X_y, dnn_net, joblib_ensemble_dnnet, From 3e2fbe00e6ac6a62df0bf7b90df08274788ae2ac Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 19 Dec 2024 18:19:50 +0100 Subject: [PATCH 13/19] formart utils --- hidimstat/utils.py | 174 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 hidimstat/utils.py diff --git a/hidimstat/utils.py b/hidimstat/utils.py new file mode 100644 index 000000000..ad8bd8741 --- /dev/null +++ b/hidimstat/utils.py @@ -0,0 +1,174 @@ +import numpy as np + + +def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False): + """ + This function implements the quantile aggregation method for p-values. + """ + # if pvalues are one-dimensional, do nothing + if pvals.shape[0] == 1: + return pvals[0] + if adaptive: + return _adaptive_quantile_aggregation(pvals, gamma_min) + else: + return _fixed_quantile_aggregation(pvals, gamma) + + +def fdr_threshold(pvals, fdr=0.1, method="bhq", reshaping_function=None): + if method == "bhq": + return _bhq_threshold(pvals, fdr=fdr) + elif method == "bhy": + return _bhy_threshold(pvals, fdr=fdr, reshaping_function=reshaping_function) + elif method == "ebh": + return _ebh_threshold(pvals, fdr=fdr) + else: + raise ValueError("{} is not support FDR control method".format(method)) + + +def cal_fdp_power(selected, non_zero_index, r_index=False): + """Calculate power and False Discovery Proportion + + Parameters + ---------- + selected: list index (in R format) of selected non-null variables + non_zero_index: true index of non-null variables + r_index : True if the index is taken from rpy2 inference + + Returns + ------- + fdp: False Discoveries Proportion + power: percentage of correctly selected variables over total number of + non-null variables + + """ + # selected is the index list in R and will be different from index of + # python by 1 unit + + if selected.size == 0: + return 0.0, 0.0 + + n_positives = len(non_zero_index) + + if r_index: + selected = selected - 1 + + true_positive = np.intersect1d(selected, non_zero_index) + false_positive = np.setdiff1d(selected, true_positive) + + fdp = len(false_positive) / max(1, len(selected)) + power = min(len(true_positive), n_positives) / n_positives + + return fdp, power + + +def _bhq_threshold(pvals, fdr=0.1): + """Standard Benjamini-Hochberg for controlling False discovery rate""" + n_features = len(pvals) + pvals_sorted = np.sort(pvals) + selected_index = 2 * n_features + for i in range(n_features - 1, -1, -1): + if pvals_sorted[i] <= fdr * (i + 1) / n_features: + selected_index = i + break + if selected_index <= n_features: + return pvals_sorted[selected_index] + else: + return -1.0 + + +def _ebh_threshold(evals, fdr=0.1): + """e-BH procedure for FDR control (see Wang and Ramdas 2021)""" + n_features = len(evals) + evals_sorted = -np.sort(-evals) # sort in descending order + selected_index = 2 * n_features + for i in range(n_features - 1, -1, -1): + if evals_sorted[i] >= n_features / (fdr * (i + 1)): + selected_index = i + break + if selected_index <= n_features: + return evals_sorted[selected_index] + else: + return np.infty + + +def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): + """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input + shape function. Reference: Ramdas et al (2017) + """ + n_features = len(pvals) + pvals_sorted = np.sort(pvals) + selected_index = 2 * n_features + # Default value for reshaping function -- defined in + # Benjamini & Yekutieli (2001) + if reshaping_function is None: + temp = np.arange(n_features) + sum_inverse = np.sum(1 / (temp + 1)) + return _bhq_threshold(pvals, fdr / sum_inverse) + else: + for i in range(n_features - 1, -1, -1): + if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features: + selected_index = i + break + if selected_index <= n_features: + return pvals_sorted[selected_index] + else: + return -1.0 + + +def _fixed_quantile_aggregation(pvals, gamma=0.5): + """Quantile aggregation function based on Meinshausen et al (2008) + + Parameters + ---------- + pvals : 2D ndarray (n_sampling_with_repetition, n_test) + p-value (adjusted) + + gamma : float + Percentile value used for aggregation. + + Returns + ------- + 1D ndarray (n_tests, ) + Vector of aggregated p-values + """ + converted_score = (1 / gamma) * (np.percentile(pvals, q=100 * gamma, axis=0)) + + return np.minimum(1, converted_score) + + +def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): + """adaptive version of the quantile aggregation method, Meinshausen et al. + (2008)""" + gammas = np.arange(gamma_min, 1.05, 0.05) + list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) + + return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) + + +def _lambda_max(X, y, use_noise_estimate=True): + """Calculation of lambda_max, the smallest value of regularization parameter in + lasso program for non-zero coefficient + """ + n_samples, _ = X.shape + + if not use_noise_estimate: + return np.max(np.dot(X.T, y)) / n_samples + + norm_y = np.linalg.norm(y, ord=2) + sigma_0 = (norm_y / np.sqrt(n_samples)) * 1e-3 + sig_star = max(sigma_0, norm_y / np.sqrt(n_samples)) + + return np.max(np.abs(np.dot(X.T, y)) / (n_samples * sig_star)) + + +def _check_vim_predict_method(method): + """Check if the method is a valid method for variable importance measure + prediction""" + if method in ["predict", "predict_proba", "decision_function", "transform"]: + return method + else: + raise ValueError( + "The method {} is not a valid method for variable importance measure prediction".format( + method + ) + ) From b2c706dbaa6facab10d49800e0edba011cc6a83d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 19 Dec 2024 19:23:58 +0100 Subject: [PATCH 14/19] Remove not necessary comments --- hidimstat/estimator/tests/_utils_test.py | 3 --- hidimstat/estimator/tests/test_RandomForestModified.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/hidimstat/estimator/tests/_utils_test.py b/hidimstat/estimator/tests/_utils_test.py index bef030fde..fd14ba6f7 100644 --- a/hidimstat/estimator/tests/_utils_test.py +++ b/hidimstat/estimator/tests/_utils_test.py @@ -29,8 +29,6 @@ def generate_data( Data matrix y : np.array Target vector - grps : np.array - Group vector """ rng = np.random.default_rng(seed) if problem_type == "regression": @@ -48,7 +46,6 @@ def generate_data( n_features=n_features, random_state=seed, ) - # y = np.array([str(i) for i in y]) X = pd.DataFrame(X, columns=[f"col{i+1}" for i in range(n_features)]) diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/hidimstat/estimator/tests/test_RandomForestModified.py index a7452c706..63bc84bd9 100644 --- a/hidimstat/estimator/tests/test_RandomForestModified.py +++ b/hidimstat/estimator/tests/test_RandomForestModified.py @@ -9,8 +9,6 @@ def test_RandomForestRegressorModified(): """ Test the RandomForestRegressorModified for regression. - Parameters: - - regression_data: A tuple containing the input features (X) and target variable (y) for regression. """ X, y = generate_data(problem_type="regression") learner = RandomForestRegressorModified(n_jobs=10, verbose=0) From 20e09acb1c591f5fc26ca300805406d4ac7735aa Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 7 Mar 2025 10:44:12 +0100 Subject: [PATCH 15/19] Move to src folder --- {hidimstat => src/hidimstat}/estimator/_utils/Dnn.py | 0 {hidimstat/estimator/tests => test/estimator_tests}/__init__.py | 0 .../estimator/tests => test/estimator_tests}/_utils_test.py | 0 .../estimator/tests => test/estimator_tests}/test_Dnn_learner.py | 0 .../tests => test/estimator_tests}/test_Dnn_learner_single.py | 0 .../tests => test/estimator_tests}/test_RandomForestModified.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename {hidimstat => src/hidimstat}/estimator/_utils/Dnn.py (100%) rename {hidimstat/estimator/tests => test/estimator_tests}/__init__.py (100%) rename {hidimstat/estimator/tests => test/estimator_tests}/_utils_test.py (100%) rename {hidimstat/estimator/tests => test/estimator_tests}/test_Dnn_learner.py (100%) rename {hidimstat/estimator/tests => test/estimator_tests}/test_Dnn_learner_single.py (100%) rename {hidimstat/estimator/tests => test/estimator_tests}/test_RandomForestModified.py (100%) diff --git a/hidimstat/estimator/_utils/Dnn.py b/src/hidimstat/estimator/_utils/Dnn.py similarity index 100% rename from hidimstat/estimator/_utils/Dnn.py rename to src/hidimstat/estimator/_utils/Dnn.py diff --git a/hidimstat/estimator/tests/__init__.py b/test/estimator_tests/__init__.py similarity index 100% rename from hidimstat/estimator/tests/__init__.py rename to test/estimator_tests/__init__.py diff --git a/hidimstat/estimator/tests/_utils_test.py b/test/estimator_tests/_utils_test.py similarity index 100% rename from hidimstat/estimator/tests/_utils_test.py rename to test/estimator_tests/_utils_test.py diff --git a/hidimstat/estimator/tests/test_Dnn_learner.py b/test/estimator_tests/test_Dnn_learner.py similarity index 100% rename from hidimstat/estimator/tests/test_Dnn_learner.py rename to test/estimator_tests/test_Dnn_learner.py diff --git a/hidimstat/estimator/tests/test_Dnn_learner_single.py b/test/estimator_tests/test_Dnn_learner_single.py similarity index 100% rename from hidimstat/estimator/tests/test_Dnn_learner_single.py rename to test/estimator_tests/test_Dnn_learner_single.py diff --git a/hidimstat/estimator/tests/test_RandomForestModified.py b/test/estimator_tests/test_RandomForestModified.py similarity index 100% rename from hidimstat/estimator/tests/test_RandomForestModified.py rename to test/estimator_tests/test_RandomForestModified.py From 7afc7e55ec797a4ab9bee4a560b7d0690c1ad41d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 7 Mar 2025 10:48:13 +0100 Subject: [PATCH 16/19] remove old folder --- hidimstat/utils.py | 174 --------------------------------------------- 1 file changed, 174 deletions(-) delete mode 100644 hidimstat/utils.py diff --git a/hidimstat/utils.py b/hidimstat/utils.py deleted file mode 100644 index ad8bd8741..000000000 --- a/hidimstat/utils.py +++ /dev/null @@ -1,174 +0,0 @@ -import numpy as np - - -def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False): - """ - This function implements the quantile aggregation method for p-values. - """ - # if pvalues are one-dimensional, do nothing - if pvals.shape[0] == 1: - return pvals[0] - if adaptive: - return _adaptive_quantile_aggregation(pvals, gamma_min) - else: - return _fixed_quantile_aggregation(pvals, gamma) - - -def fdr_threshold(pvals, fdr=0.1, method="bhq", reshaping_function=None): - if method == "bhq": - return _bhq_threshold(pvals, fdr=fdr) - elif method == "bhy": - return _bhy_threshold(pvals, fdr=fdr, reshaping_function=reshaping_function) - elif method == "ebh": - return _ebh_threshold(pvals, fdr=fdr) - else: - raise ValueError("{} is not support FDR control method".format(method)) - - -def cal_fdp_power(selected, non_zero_index, r_index=False): - """Calculate power and False Discovery Proportion - - Parameters - ---------- - selected: list index (in R format) of selected non-null variables - non_zero_index: true index of non-null variables - r_index : True if the index is taken from rpy2 inference - - Returns - ------- - fdp: False Discoveries Proportion - power: percentage of correctly selected variables over total number of - non-null variables - - """ - # selected is the index list in R and will be different from index of - # python by 1 unit - - if selected.size == 0: - return 0.0, 0.0 - - n_positives = len(non_zero_index) - - if r_index: - selected = selected - 1 - - true_positive = np.intersect1d(selected, non_zero_index) - false_positive = np.setdiff1d(selected, true_positive) - - fdp = len(false_positive) / max(1, len(selected)) - power = min(len(true_positive), n_positives) / n_positives - - return fdp, power - - -def _bhq_threshold(pvals, fdr=0.1): - """Standard Benjamini-Hochberg for controlling False discovery rate""" - n_features = len(pvals) - pvals_sorted = np.sort(pvals) - selected_index = 2 * n_features - for i in range(n_features - 1, -1, -1): - if pvals_sorted[i] <= fdr * (i + 1) / n_features: - selected_index = i - break - if selected_index <= n_features: - return pvals_sorted[selected_index] - else: - return -1.0 - - -def _ebh_threshold(evals, fdr=0.1): - """e-BH procedure for FDR control (see Wang and Ramdas 2021)""" - n_features = len(evals) - evals_sorted = -np.sort(-evals) # sort in descending order - selected_index = 2 * n_features - for i in range(n_features - 1, -1, -1): - if evals_sorted[i] >= n_features / (fdr * (i + 1)): - selected_index = i - break - if selected_index <= n_features: - return evals_sorted[selected_index] - else: - return np.infty - - -def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): - """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input - shape function. Reference: Ramdas et al (2017) - """ - n_features = len(pvals) - pvals_sorted = np.sort(pvals) - selected_index = 2 * n_features - # Default value for reshaping function -- defined in - # Benjamini & Yekutieli (2001) - if reshaping_function is None: - temp = np.arange(n_features) - sum_inverse = np.sum(1 / (temp + 1)) - return _bhq_threshold(pvals, fdr / sum_inverse) - else: - for i in range(n_features - 1, -1, -1): - if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features: - selected_index = i - break - if selected_index <= n_features: - return pvals_sorted[selected_index] - else: - return -1.0 - - -def _fixed_quantile_aggregation(pvals, gamma=0.5): - """Quantile aggregation function based on Meinshausen et al (2008) - - Parameters - ---------- - pvals : 2D ndarray (n_sampling_with_repetition, n_test) - p-value (adjusted) - - gamma : float - Percentile value used for aggregation. - - Returns - ------- - 1D ndarray (n_tests, ) - Vector of aggregated p-values - """ - converted_score = (1 / gamma) * (np.percentile(pvals, q=100 * gamma, axis=0)) - - return np.minimum(1, converted_score) - - -def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): - """adaptive version of the quantile aggregation method, Meinshausen et al. - (2008)""" - gammas = np.arange(gamma_min, 1.05, 0.05) - list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) - - return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) - - -def _lambda_max(X, y, use_noise_estimate=True): - """Calculation of lambda_max, the smallest value of regularization parameter in - lasso program for non-zero coefficient - """ - n_samples, _ = X.shape - - if not use_noise_estimate: - return np.max(np.dot(X.T, y)) / n_samples - - norm_y = np.linalg.norm(y, ord=2) - sigma_0 = (norm_y / np.sqrt(n_samples)) * 1e-3 - sig_star = max(sigma_0, norm_y / np.sqrt(n_samples)) - - return np.max(np.abs(np.dot(X.T, y)) / (n_samples * sig_star)) - - -def _check_vim_predict_method(method): - """Check if the method is a valid method for variable importance measure - prediction""" - if method in ["predict", "predict_proba", "decision_function", "transform"]: - return method - else: - raise ValueError( - "The method {} is not a valid method for variable importance measure prediction".format( - method - ) - ) From fd655432832e2b1f5d494c97624c9ea285e52413 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 7 Mar 2025 10:48:50 +0100 Subject: [PATCH 17/19] Remove unecesary function from utils --- src/hidimstat/utils.py | 117 +---------------------------------------- 1 file changed, 1 insertion(+), 116 deletions(-) diff --git a/src/hidimstat/utils.py b/src/hidimstat/utils.py index 2c112a000..4dba00627 100644 --- a/src/hidimstat/utils.py +++ b/src/hidimstat/utils.py @@ -1,4 +1,3 @@ -import copy import numpy as np @@ -326,118 +325,4 @@ def _alpha_max(X, y, use_noise_estimate=False): alpha_max = np.max(np.abs(np.dot(X.T, y)) / (n_samples * sigma_star)) - return alpha_max - - -########################### Data Preprocessing ########################## -def create_X_y( - X, - y, - sampling_with_repetition=True, - split_percentage=0.8, - problem_type="regression", - list_continuous=None, - random_state=None, -): - """ - Create train/valid split of input data X and target variable y - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples before the splitting process. - y : ndarray, shape (n_samples, ) - The output samples before the splitting process. - sampling_with_repetition : bool, default=True - Sampling with repetition the train part of the train/valid scheme under - the training set. The number of training samples in train is equal to - the number of instances in the training set. - split_percentage : float, default=0.8 - The training/validation cut for the provided data. - problem_type : str, default='regression' - A classification or a regression problem. - list_continuous : list, default=[] - The list of continuous variables. - random_state : int, default=2023 - Fixing the seeds of the random generator. - - Returns - ------- - X_train_scaled : {array-like, sparse matrix} of shape (n_train_samples, n_features) - The training input samples with scaled continuous variables. - y_train_scaled : {array-like} of shape (n_train_samples, ) - The sampling_with_repetitionped training output samples scaled if continous. - X_validation_scaled : {array-like, sparse matrix} of shape (n_validation_samples, n_features) - The validation input samples with scaled continuous variables. - y_validation_scaled : {array-like} of shape (n_validation_samples, ) - The validation output samples scaled if continous. - X_scaled : {array-like, sparse matrix} of shape (n_samples, n_features) - The original input samples with scaled continuous variables. - y_validation : {array-like} of shape (n_samples, ) - The original output samples with validation indices. - scaler_x : Scikit-learn StandardScaler - The standard scaler encoder for the continuous variables of the input. - scaler_y : Scikit-learn StandardScaler - The standard scaler encoder for the output if continuous. - valid_ind : list - The list of indices of the validation set. - """ - rng = np.random.RandomState(random_state) - scaler_x, scaler_y = StandardScaler(), StandardScaler() - n = X.shape[0] - - if sampling_with_repetition: - train_ind = rng.choice(n, n, replace=True) - else: - train_ind = rng.choice( - n, size=int(np.floor(split_percentage * n)), replace=False - ) - valid_ind = np.array([ind for ind in range(n) if ind not in train_ind]) - - X_train, X_validation = X[train_ind], X[valid_ind] - y_train, y_validation = y[train_ind], y[valid_ind] - - # Scaling X and y - X_train_scaled = X_train.copy() - X_validation_scaled = X_validation.copy() - X_scaled = X.copy() - - if len(list_continuous) > 0: - X_train_scaled[:, list_continuous] = scaler_x.fit_transform( - X_train[:, list_continuous] - ) - X_validation_scaled[:, list_continuous] = scaler_x.transform( - X_validation[:, list_continuous] - ) - X_scaled[:, list_continuous] = scaler_x.transform(X[:, list_continuous]) - if problem_type == "regression": - y_train_scaled = scaler_y.fit_transform(y_train) - y_validation_scaled = scaler_y.transform(y_validation) - else: - y_train_scaled = y_train.copy() - y_validation_scaled = y_validation.copy() - - return ( - X_train_scaled, - y_train_scaled, - X_validation_scaled, - y_validation_scaled, - X_scaled, - y_validation, - scaler_x, - scaler_y, - valid_ind, - ) - - -def _check_vim_predict_method(method): - """Check if the method is a valid method for variable importance measure - prediction""" - if method in ["predict", "predict_proba", "decision_function", "transform"]: - return method - else: - raise ValueError( - "The method {} is not a valid method for variable importance measure prediction".format( - method - ) - ) \ No newline at end of file + return alpha_max \ No newline at end of file From bec739b99d04aef92c2b91a82a79f7ef8e03cf8c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 7 Mar 2025 11:01:34 +0100 Subject: [PATCH 18/19] Remove RF --- .../estimator/RandomForestModified.py | 99 ----------------- .../test_RandomForestModified.py | 104 ------------------ 2 files changed, 203 deletions(-) delete mode 100644 src/hidimstat/estimator/RandomForestModified.py delete mode 100644 test/estimator_tests/test_RandomForestModified.py diff --git a/src/hidimstat/estimator/RandomForestModified.py b/src/hidimstat/estimator/RandomForestModified.py deleted file mode 100644 index 097e3d473..000000000 --- a/src/hidimstat/estimator/RandomForestModified.py +++ /dev/null @@ -1,99 +0,0 @@ -import numpy as np -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor - - -class RandomForestClassifierModified(RandomForestClassifier): - def fit(self, X, y): - self.y_ = y - return super().fit(X, y) - - def predict(self, X): - return super().predict(X) - - def predict_proba(self, X): - return super().predict_proba(X) - - def sample_same_leaf(self, X, y=None): - if not (y is None): - self.y_ = y - rng = np.random.RandomState(self.get_params()["random_state"]) - # The number of samples to sample from in the next step - random_samples = 10 - - # Get the leaf indices for each sample in the input data - leaf_indices = self.apply(X) - - # Initialize an array to store the predictions for each sample - predictions = [] - - # Loop over each sample in the input data - for i in range(X.shape[0]): - # Removing the row of the corresponding input sample - leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0) - y_minus_i = np.delete(self.y_, i, axis=0) - - # The list of indices sampled from the same leaf of the input sample - leaf_samples = [] - # Loop over each tree in the forest - for j in range(self.n_estimators): - # Find the samples that fall into the same leaf node for this tree - samples_in_leaf = np.where( - leaf_indices_minus_i[:, j] == leaf_indices[i, j] - )[0] - - # Append the samples to the list - if samples_in_leaf.size > 0: - leaf_samples.append( - y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] - ) - - predictions.append(leaf_samples) - - # Combine the predictions from all trees to make the final prediction - return np.array(predictions, dtype=object) - - -class RandomForestRegressorModified(RandomForestRegressor): - def fit(self, X, y): - self.y_ = y - return super().fit(X, y) - - def predict(self, X): - return super().predict(X) - - def sample_same_leaf(self, X): - rng = np.random.RandomState(self.get_params()["random_state"]) - # The number of samples to sample from in the next step - random_samples = 10 - - # Get the leaf indices for each sample in the input data - leaf_indices = self.apply(X) - - # Initialize an array to store the predictions for each sample - predictions = [] - - # Loop over each sample in the input data - for i in range(X.shape[0]): - # Removing the row of the corresponding input sample - leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0) - y_minus_i = np.delete(self.y_, i, axis=0) - - # The list of indices sampled from the same leaf of the input sample - leaf_samples = [] - # Loop over each tree in the forest - for j in range(self.n_estimators): - # Find the samples that fall into the same leaf node for this tree - samples_in_leaf = np.where( - leaf_indices_minus_i[:, j] == leaf_indices[i, j] - )[0] - - # Append the samples to the list - if samples_in_leaf.size > 0: - leaf_samples.append( - y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] - ) - - predictions.append(leaf_samples) - - # Combine the predictions from all trees to make the final prediction - return np.array(predictions, dtype=object) diff --git a/test/estimator_tests/test_RandomForestModified.py b/test/estimator_tests/test_RandomForestModified.py deleted file mode 100644 index 63bc84bd9..000000000 --- a/test/estimator_tests/test_RandomForestModified.py +++ /dev/null @@ -1,104 +0,0 @@ -from hidimstat.estimator.tests._utils_test import generate_data -from hidimstat.estimator.RandomForestModified import ( - RandomForestClassifierModified, - RandomForestRegressorModified, -) -import numpy as np - - -def test_RandomForestRegressorModified(): - """ - Test the RandomForestRegressorModified for regression. - """ - X, y = generate_data(problem_type="regression") - learner = RandomForestRegressorModified(n_jobs=10, verbose=0) - learner.fit(X, y) - predict = learner.predict(X) - # Check if the predicted values are close to the true values for at least one instance - assert np.max(np.abs(predict - y)) < 200.0 - # Check if the predicted values are close to the true values for at least one instance - assert np.all(predict == y) or np.any(predict != y) - # Check if the predicted values are not all the same - assert not np.all(predict == predict[0]) - # Check if the predicted values are not all zeros - assert not np.all(predict == 0) - # Check if the predicted values are not all ones - assert not np.all(predict == 1) - # Check if the feature importances are not all zeros - assert not np.all(learner.feature_importances_ == 0) - # Check if the feature importances are not all the same - assert not np.all(learner.feature_importances_ == learner.feature_importances_[0]) - # Check if the feature importances are not all ones - assert not np.all(learner.feature_importances_ == 1) - # Check if the feature importances are not all negative - assert not np.all(learner.feature_importances_ < 0) - # # Check if the feature importances are not all positive - # assert not np.all(learner.feature_importances_ > 0) - # Check if the feature importances are not all close to zero - assert not np.allclose(learner.feature_importances_, 0) - # Check if the feature importances are not all close to one - assert not np.allclose(learner.feature_importances_, 1) - - predictions = learner.sample_same_leaf(X) - # TODO: add more tests for sample_same_leaf - - -def test_RandomForestClassifierModified(): - """ - Test the RandomForestClassifierModified for classification. - """ - X, y = generate_data(problem_type="classification") - learner = RandomForestClassifierModified(n_jobs=10, verbose=0) - learner.fit(X, y) - predict_prob = learner.predict_proba(X) - # Check if the predicted probabilities sum up to 1 for each instance - assert np.allclose(np.sum(predict_prob, axis=1), 1) - # Check if the predicted class labels match the true labels for at least one instance - assert np.sum(np.argmax(predict_prob, axis=1) == y) > 0 - assert np.all(np.max(predict_prob, axis=1) >= 0.5) - assert np.all(np.min(predict_prob, axis=1) < 0.5) - # Check if the maximum predicted probability is greater than 0.95 - assert 0.95 < np.max(predict_prob) - # Check if the minimum predicted probability is less than 0.05 - assert 0.05 > np.min(predict_prob) - # Check if the predicted probabilities are not all the same - assert not np.all(predict_prob == predict_prob[0]) - # Check if the predicted probabilities are not all zeros - assert not np.all(predict_prob == 0) - # Check if the predicted probabilities are not all ones - assert not np.all(predict_prob == 1) - # Check if the predicted probabilities are not all the same for each class - assert not np.all(predict_prob[:, 0] == predict_prob[0, 0]) - assert not np.all(predict_prob[:, 1] == predict_prob[0, 1]) - # Check if the predicted probabilities are not all zeros for each class - assert not np.all(predict_prob[:, 0] == 0) - assert not np.all(predict_prob[:, 1] == 0) - # Check if the predicted probabilities are not all ones for each class - assert not np.all(predict_prob[:, 0] == 1) - - predict = learner.predict(X) - # Check if the predicted values are close to the true values for at least one instance - assert np.all(predict == y) or np.any(predict != y) - # Check if the predicted values are not all the same - assert not np.all(predict == predict[0]) - # Check if the predicted values are not all zeros - assert not np.all(predict == 0) - # Check if the predicted values are not all ones - assert not np.all(predict == 1) - # Check if the feature importances are not all zeros - assert not np.all(learner.feature_importances_ == 0) - # Check if the feature importances are not all the same - assert not np.all(learner.feature_importances_ == learner.feature_importances_[0]) - # Check if the feature importances are not all ones - assert not np.all(learner.feature_importances_ == 1) - # Check if the feature importances are not all negative - assert not np.all(learner.feature_importances_ < 0) - # # Check if the feature importances are not all positive - # assert not np.all(learner.feature_importances_ > 0) - # Check if the feature importances are not all close to zero - assert not np.allclose(learner.feature_importances_, 0) - # Check if the feature importances are not all close to one - assert not np.allclose(learner.feature_importances_, 1) - - predictions = learner.sample_same_leaf(X) - # TODO: add more tests for sample_same_leaf From 7d3077221a5edc069206c887193f01a34fe26ed4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 11 Mar 2025 13:07:17 +0100 Subject: [PATCH 19/19] Remove example link to RandomForest --- .../plot_residuals_sampling.py | 237 ------------------ 1 file changed, 237 deletions(-) delete mode 100644 examples_not_exhibited/plot_residuals_sampling.py diff --git a/examples_not_exhibited/plot_residuals_sampling.py b/examples_not_exhibited/plot_residuals_sampling.py deleted file mode 100644 index ec30d6ec4..000000000 --- a/examples_not_exhibited/plot_residuals_sampling.py +++ /dev/null @@ -1,237 +0,0 @@ -""" -Conditional sampling using residuals vs sampling Random Forest -============================================================== - -To deploy the Conditional Permutation Importance (CPI), -:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the -conditional scheme: 1) Instead of directly permuting the variable of interest as -in the Permutation Feature Importance (PFI), the residuals of the prediction of -the variable of interest x_j based on the remaining variables is first computed -along with a predicted version x_hat_j. These residuals are shuffled and added -to the predicted version to recreate the variable of interest (Preserving the -dependency between the variable of interest and the remaining variables while -breaking the relationship with the outcome). 2) Another option is to use the -sampling Random Forest. Using the remaining variables to predict the variable of -interest, and instead of predicting the variable of interest as the mean of the -instances' outcome of the targeted leaf or the class with the most occurences, -we sample from the same leaf of the instance of interest within its neighbors, -and we follow the standard path of the Random Forest. - -References ----------- -.. footbibliography:: - -""" - -############################################################################# -# Imports needed for this script -# ------------------------------ - -from hidimstat import BlockBasedImportance -from joblib import Parallel, delayed -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from scipy.linalg import cholesky -from scipy.stats import norm -from sklearn.ensemble import RandomForestRegressor -from sklearn.metrics import roc_auc_score -import time - -n, p = (100, 12) -inter_cor, intra_cor = (0, 0.85) -n_blocks = 1 -n_signal = 2 -problem_type = "regression" -snr = 4 -rf = RandomForestRegressor(random_state=2023) -dict_hyper = {"max_depth": [2, 5, 10, 20]} - -############################################################################# -# Generate the synthetic data -# --------------------------- -# The function below generates the correlation matrix between the variables -# according to the provided degrees of correlation (intra + inter). `inter_cor` -# indicates the degree of correlation between the variables/groups whereas -# `intra_cor` specifies the corresponding degree between the variables within -# each group. For the single-level case, `n_blocks` is set to 1 and the -# `intra_cor` is the unique correlation between variables. -# -# Next, we generate the synthetic data by randomly drawing n_signal predictors -# from the corresponding p variables and reordering the set of variables to put the -# n_signal predictors at the beginning. Following, the response is generated -# under a simple linear model with Gaussian noise. - - -def generate_cor_blocks(p, inter_cor, intra_cor, n_blocks): - vars_per_grp = int(p / n_blocks) - cor_mat = np.zeros((p, p)) - cor_mat.fill(inter_cor) - for i in range(n_blocks): - cor_mat[ - (i * vars_per_grp) : ((i + 1) * vars_per_grp), - (i * vars_per_grp) : ((i + 1) * vars_per_grp), - ] = intra_cor - np.fill_diagonal(cor_mat, 1) - return cor_mat - - -def _generate_data(seed): - rng = np.random.RandomState(seed) - - cor_mat = generate_cor_blocks(p, inter_cor, intra_cor, n_blocks) - x = norm.rvs(size=(p, n), random_state=seed) - c = cholesky(cor_mat, lower=True) - X = pd.DataFrame(np.dot(c, x).T, columns=[str(i) for i in np.arange(p)]) - - data = X.copy() - - # Randomly draw n_signal predictors which are defined as signal predictors - indices_var = list(rng.choice(range(data.shape[1]), size=n_signal, replace=False)) - - # Reorder data matrix so that first n_signal predictors are the signal predictors - # List of remaining indices - indices_rem = [ind for ind in range(data.shape[1]) if ind not in indices_var] - total_indices = indices_var + indices_rem - # Before including the non-linear effects - data = data.iloc[:, total_indices] - data_signal = data.iloc[:, np.arange(n_signal)] - - # Determine beta coefficients - effectset = [-0.5, -1, -2, -3, 0.5, 1, 2, 3] - beta = rng.choice(effectset, size=data_signal.shape[1], replace=True) - - # Generate response - # The product of the signal predictors with the beta coefficients - prod_signal = np.dot(data_signal, beta) - - sigma_noise = np.linalg.norm(prod_signal, ord=2) / ( - snr * np.sqrt(data_signal.shape[0]) - ) - y = prod_signal + sigma_noise * rng.normal(size=prod_signal.shape[0]) - - return data, y - - -############################################################################# -# Processing across multiple permutations -# --------------------------------------- -# In order to get statistical significance with p-values, we run the experiments -# across 10 repetitions. -# - - -def compute_simulations(seed): - X, y = _generate_data(seed) - # Using the residuals - start_residuals = time.time() - bbi_residual = BlockBasedImportance( - estimator="RF", - importance_estimator="residuals_RF", - do_hypertuning=True, - dict_hypertuning=None, - conditional=True, - n_permutations=10, - n_jobs=2, - problem_type="regression", - k_fold=2, - variables_categories={}, - ) - bbi_residual.fit(X, y) - results_bbi_residual = bbi_residual.compute_importance() - - df_residuals = {} - df_residuals["method"] = ["residuals"] * X.shape[1] - df_residuals["score"] = [results_bbi_residual["score_R2"]] * X.shape[1] - df_residuals["elapsed"] = [time.time() - start_residuals] * X.shape[1] - df_residuals["importance"] = np.ravel(results_bbi_residual["importance"]) - df_residuals["p-value"] = np.ravel(results_bbi_residual["pval"]) - df_residuals["iteration"] = [seed] * X.shape[1] - df_residuals = pd.DataFrame(df_residuals) - - # Using the sampling RF - start_sampling = time.time() - bbi_sampling = BlockBasedImportance( - estimator="RF", - importance_estimator="sampling_RF", - do_hypertuning=True, - dict_hypertuning=None, - conditional=True, - n_permutations=10, - n_jobs=2, - problem_type="regression", - k_fold=2, - variables_categories={}, - ) - bbi_sampling.fit(X, y) - results_bbi_sampling = bbi_sampling.compute_importance() - - df_sampling = {} - df_sampling["method"] = ["sampling"] * X.shape[1] - df_sampling["score"] = [results_bbi_sampling["score_R2"]] * X.shape[1] - df_sampling["elapsed"] = [time.time() - start_sampling] * X.shape[1] - df_sampling["importance"] = np.ravel(results_bbi_sampling["importance"]) - df_sampling["p-value"] = np.ravel(results_bbi_sampling["pval"]) - df_sampling["iteration"] = [seed] * X.shape[1] - df_sampling = pd.DataFrame(df_sampling) - - df_final = pd.concat([df_residuals, df_sampling], axis=0) - return df_final - - -# Running across 10 repetitions -parallel = Parallel(n_jobs=2, verbose=1) -final_result = parallel( - delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11) -) - -############################################################################# -# Plotting AUC score and Type-I error -# ----------------------------------- -# With the prediction problems turns to be a binary classification problem for -# the variables being relevant or non-relevant vs the ground-truth, we measure -# the performance in terms of type-I error i.e. the rate of true non-relevant -# variables detected as relevant and AUC score related to correct significant -# variables ordering. -# - -df_final_result = pd.concat(final_result, axis=0).reset_index(drop=True) -df_auc = df_final_result.groupby(by=["method", "iteration"]).apply( - lambda x: roc_auc_score([1] * n_signal + [0] * (p - n_signal), -x["p-value"]) -) -df_auc = df_auc.reset_index(name="auc") -df_type_I = df_final_result.groupby(by=["method", "iteration"]).apply( - lambda x: sum(x.iloc[n_signal:, :]["p-value"] <= 0.05) / x.iloc[2:, :].shape[0] -) -df_type_I = df_type_I.reset_index(name="type-I") - -auc = [ - np.array(df_auc["auc"])[: int(df_auc.shape[0] / 2)], - np.array(df_auc["auc"])[int(df_auc.shape[0] / 2) :], -] -typeI_error = [ - np.array(df_type_I["type-I"])[: int(df_type_I.shape[0] / 2)], - np.array(df_type_I["type-I"])[int(df_type_I.shape[0] / 2) :], -] - -fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), sharey=True) - -# AUC score -axs[0].violinplot(auc, showmeans=False, showmedians=True, vert=False) -axs[0].set_title("AUC score") -axs[0].xaxis.grid(True) -axs[0].set_yticks([x + 1 for x in range(len(auc))], labels=["Residuals", "Sampling"]) -axs[0].set_ylabel("Method") - -# Type-I Error -axs[1].violinplot(typeI_error, showmeans=False, showmedians=True, vert=False) -axs[1].set_title("Type-I Error") -axs[1].axvline(x=0.05, color="r", label="Nominal Rate") -plt.show() - -############################################################################# -# Analysis of the results -# ----------------------- -# We can observe that the sampling approaches'performance is almost similar to -# that of the residuals. Sampling accelerates the conditional importance -# computation by simplifying the residuals steps.