Skip to content

Problem with many highly correlated features #22

@woznicak

Description

@woznicak

I have data with highly correlated variables and many of them are clustering together at height 0 at algorithms. The function hierarchical_importance does not work with this example.

Below I provide reproducible example.

library(dplyr)
library(triplot)
library(DALEX)
library(gbm)

## download https://github.com/woznicak/MetaFeaturesImpact/blob/master/summary_results_surrogate_models_rank_per_algo.Rd
## This is the Rdata object with list of explainers

summary_results <- readRDS('summary_results_surrogate_models_rank_per_algo.Rd')

explainer_gbm <- summary_results$explainer_GBM_deep[[11]]

tri_var_imp <- calculate_triplot(explainer_gbm, 
                                 data = explainer_gbm$data,
                                 y = explainer_gbm$y,
                                 new_observation = explainer_gbm$data[1,],
                                 predict_function = sexplainer_gbm$predict_function)

I work around this problem and this is my fixing of function

hierarchical_importance <- function (x, data, y = NULL, predict_function = predict, type = "predict", 
          new_observation = NULL, N = 1000, loss_function = DALEX::loss_root_mean_square, 
          B = 10, fi_type = c("raw", "ratio", "difference"), clust_method = "complete", 
          cor_method = "spearman", ...) 
{
  if (all(type != "predict", is.null(y))) {
    stop("Target is needed for hierarchical_importance calculated at model \n         level")
  }
  fi_type <- match.arg(fi_type)
  x_hc <- hclust(as.dist(1 - abs(cor(data, method = cor_method))), 
                 method = clust_method)
  cutting_heights <- x_hc$height
  # aspects_list_previous <- list_variables(x_hc, 1)
  aspects_list_previous <- as.list(colnames(data))
  int_node_importance <- as.data.frame(NULL)
  for (i in c(1:(length(cutting_heights) - 1))) {
    aspects_list_current <- list_variables(x_hc, 1 - cutting_heights[i])
    t1 <- match(aspects_list_current, setdiff(aspects_list_current, 
                                              aspects_list_previous))
    for(k in na.omit(t1)){
      t2 <- which(t1 == k)
      t3 <- aspects_list_current[t2]
      group_name <- names(t3)
      if (type != "predict") {
        
        explainer <- explain(model = x, data = data, y = y,
                             predict_function = predict_function, verbose = FALSE)
        res_ai <- feature_importance(explainer = explainer,
                                     variable_groups = aspects_list_current, N = N,
                                     loss_function = loss_function, B = B, type = fi_type)
        
        
       
        class(res_ai) <- c("model_parts" ,"feature_importance_explainer", class(res_ai))
        
        
        
        res_ai <- res_ai[res_ai$permutation == "0", ]
        int_node_importance[nrow(int_node_importance) + 1, 1] <- res_ai[res_ai$variable == 
                                                                          group_name, ]$dropout_loss
      }
      else {
        res_ai <- aspect_importance(x = x, data = data, 
                                    predict_function = predict_function, new_observation = new_observation, 
                                    variable_groups = aspects_list_current, N = N)
        int_node_importance[nrow(int_node_importance) +1 , 1] <- res_ai[res_ai$variable_groups == 
                                                                          group_name, ]$importance
      }
      int_node_importance[nrow(int_node_importance), 2] <- group_name
      int_node_importance[nrow(int_node_importance), 3] <- cutting_heights[i]
    }
    
    aspects_list_previous <- aspects_list_current
  }
  if (type != "predict") {
    res <- feature_importance(explainer = explainer, variable_groups = , N = N, loss_function = loss_function, B = B)
    res <- res[res$permutation == "0", ]
    baseline_val <- res[res$variable == "aspect.group1", 
    ]$dropout_loss
    int_node_importance[length(cutting_heights), 1] <- baseline_val
  }
  else {
    int_node_importance[(nrow(int_node_importance)+1):length(cutting_heights), 1] <- NA
  }
  x_hc$height <- int_node_importance$V1
  hi <- list(x_hc, type, new_observation)
  class(hi) <- c("hierarchical_importance")
  return(hi)
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions