I trained a Random Survival Forest with 10 estimators and a maximum depth of 25 on approximately 1,800 data samples. The full dataset contains approximately 200,000 data samples, but I had intentionally used only a very small sample when I encountered this error.
When attempting to fit a ModelSurvSHAP on this very small dummy random survival forest, I encounter the following error: MemoryError: Unable to allocate 512. TiB for an array with shape (8388608, 8388608) and data type float64
I'm using survshap version 0.4.2.
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
Cell In[38], line 6
3 rsf_exp = SurvivalModelExplainer(rsf, X_test, y_test)
5 exp1_survshap_global_rsf = ModelSurvSHAP(random_state=42)
----> 6 exp1_survshap_global_rsf.fit(rsf_exp)
File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\model_explanations\object.py:76, in ModelSurvSHAP.fit(self, explainer, new_observations, timestamps, save_individual_explanations, **kwargs)
69 if new_observations is None:
70 new_observations = explainer.data
72 (
73 self.full_result,
74 self.individual_explanations,
75 self.timestamps,
---> 76 ) = calculate_individual_explanations(
77 explainer,
78 new_observations,
79 self.function_type,
80 self.path,
81 self.B,
82 self.max_shap_value_inputs,
83 self.random_state,
84 self.calculation_method,
85 self.aggregation_method,
86 timestamps,
87 save_individual_explanations,
88 **kwargs
89 )
91 names = explainer.y.dtype.names
92 self.event_ind = explainer.y[names[0]]
File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\model_explanations\utils.py:127, in calculate_individual_explanations(explainer, new_observations, function_type, path, B, max_shap_value_inputs, random_state, calculation_method, aggregation_method, timestamps, save_individual_explanations, **kwargs)
117 for i in tqdm(range(len(new_observations))):
118 survSHAP_obj = PredictSurvSHAP(
119 function_type=function_type,
120 path=path,
(...)
125 random_state=random_state,
126 )
--> 127 survSHAP_obj.fit(explainer, new_observations.iloc[[i]], timestamps)
128 if save_individual_explanations:
129 individual_explanations.append(survSHAP_obj)
File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\predict_explanations\object.py:81, in PredictSurvSHAP.fit(self, explainer, new_observation, timestamps, y_true)
72 self.y_true_time = y_true[names[1]]
74 if self.calculation_method == "kernel":
75 (
76 self.result,
77 self.predicted_function,
78 self.baseline_function,
79 self.timestamps,
80 self.r2,
---> 81 ) = shap_kernel(
82 explainer,
83 new_observation,
84 self.function,
85 self.aggregation_method,
86 timestamps,
87 self.max_shap_value_inputs,
88 )
89 elif self.calculation_method == "sampling":
90 (
91 self.result,
92 self.predicted_function,
(...)
104 self.exact,
105 )
File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\predict_explanations\utils.py:106, in shap_kernel(explainer, new_observation, function_type, aggregation_method, timestamps, max_shap_value_inputs)
101 print(
102 f"Approximate Survival Shapley will sample only {max_shap_value_inputs} values instead of 2**{p} for Exact Shapley"
103 )
105 kernel_weights = generate_shap_kernel_weights(simplified_inputs, p)
--> 106 shap_values, r2 = calculate_shap_values(
107 explainer,
108 function_type,
109 baseline_f,
110 explainer.data,
111 simplified_inputs,
112 kernel_weights,
113 new_observation,
114 timestamps,
115 )
117 variable_names = explainer.data.columns
118 result = prepare_result_df(new_observation, variable_names, shap_values, timestamps, aggregation_method)
File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\predict_explanations\utils.py:158, in calculate_shap_values(model, function_type, avg_function, data, simplified_inputs, shap_kernel_weights, new_observation, timestamps)
148 def calculate_shap_values(
149 model,
150 function_type,
(...)
156 timestamps,
157 ):
--> 158 W = np.diag(shap_kernel_weights)
159 X = np.array(simplified_inputs)
160 R = np.linalg.inv(X.T @ W @ X) @ (X.T @ W)
File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\numpy\lib\twodim_base.py:293, in diag(v, k)
291 if len(s) == 1:
292 n = s[0]+abs(k)
--> 293 res = zeros((n, n), v.dtype)
294 if k >= 0:
295 i = k
MemoryError: Unable to allocate 512. TiB for an array with shape (8388608, 8388608) and data type float64
Issue Description
I trained a Random Survival Forest with 10 estimators and a maximum depth of 25 on approximately 1,800 data samples. The full dataset contains approximately 200,000 data samples, but I had intentionally used only a very small sample when I encountered this error.
When attempting to fit a ModelSurvSHAP on this very small dummy random survival forest, I encounter the following error:
MemoryError: Unable to allocate 512. TiB for an array with shape (8388608, 8388608) and data type float64
I'm using survshap version 0.4.2.
Minimal Reproducible Code Sample
Error Trace:
--------------------------------------------------------------------------- MemoryError Traceback (most recent call last) Cell In[38], line 6 3 rsf_exp = SurvivalModelExplainer(rsf, X_test, y_test) 5 exp1_survshap_global_rsf = ModelSurvSHAP(random_state=42) ----> 6 exp1_survshap_global_rsf.fit(rsf_exp) File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\model_explanations\object.py:76, in ModelSurvSHAP.fit(self, explainer, new_observations, timestamps, save_individual_explanations, **kwargs) 69 if new_observations is None: 70 new_observations = explainer.data 72 ( 73 self.full_result, 74 self.individual_explanations, 75 self.timestamps, ---> 76 ) = calculate_individual_explanations( 77 explainer, 78 new_observations, 79 self.function_type, 80 self.path, 81 self.B, 82 self.max_shap_value_inputs, 83 self.random_state, 84 self.calculation_method, 85 self.aggregation_method, 86 timestamps, 87 save_individual_explanations, 88 **kwargs 89 ) 91 names = explainer.y.dtype.names 92 self.event_ind = explainer.y[names[0]] File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\model_explanations\utils.py:127, in calculate_individual_explanations(explainer, new_observations, function_type, path, B, max_shap_value_inputs, random_state, calculation_method, aggregation_method, timestamps, save_individual_explanations, **kwargs) 117 for i in tqdm(range(len(new_observations))): 118 survSHAP_obj = PredictSurvSHAP( 119 function_type=function_type, 120 path=path, (...) 
125 random_state=random_state, 126 ) --> 127 survSHAP_obj.fit(explainer, new_observations.iloc[[i]], timestamps) 128 if save_individual_explanations: 129 individual_explanations.append(survSHAP_obj) File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\predict_explanations\object.py:81, in PredictSurvSHAP.fit(self, explainer, new_observation, timestamps, y_true) 72 self.y_true_time = y_true[names[1]] 74 if self.calculation_method == "kernel": 75 ( 76 self.result, 77 self.predicted_function, 78 self.baseline_function, 79 self.timestamps, 80 self.r2, ---> 81 ) = shap_kernel( 82 explainer, 83 new_observation, 84 self.function, 85 self.aggregation_method, 86 timestamps, 87 self.max_shap_value_inputs, 88 ) 89 elif self.calculation_method == "sampling": 90 ( 91 self.result, 92 self.predicted_function, (...) 104 self.exact, 105 ) File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\predict_explanations\utils.py:106, in shap_kernel(explainer, new_observation, function_type, aggregation_method, timestamps, max_shap_value_inputs) 101 print( 102 f"Approximate Survival Shapley will sample only {max_shap_value_inputs} values instead of 2**{p} for Exact Shapley" 103 ) 105 kernel_weights = generate_shap_kernel_weights(simplified_inputs, p) --> 106 shap_values, r2 = calculate_shap_values( 107 explainer, 108 function_type, 109 baseline_f, 110 explainer.data, 111 simplified_inputs, 112 kernel_weights, 113 new_observation, 114 timestamps, 115 ) 117 variable_names = explainer.data.columns 118 result = prepare_result_df(new_observation, variable_names, shap_values, timestamps, aggregation_method) File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\survshap\predict_explanations\utils.py:158, in calculate_shap_values(model, function_type, avg_function, data, simplified_inputs, shap_kernel_weights, new_observation, timestamps) 148 def calculate_shap_values( 149 model, 150 function_type, (...) 
156 timestamps, 157 ): --> 158 W = np.diag(shap_kernel_weights) 159 X = np.array(simplified_inputs) 160 R = np.linalg.inv(X.T @ W @ X) @ (X.T @ W) File c:\Users\alenk\anaconda3\envs\azureml_py310_sdkv2\lib\site-packages\numpy\lib\twodim_base.py:293, in diag(v, k) 291 if len(s) == 1: 292 n = s[0]+abs(k) --> 293 res = zeros((n, n), v.dtype) 294 if k >= 0: 295 i = k MemoryError: Unable to allocate 512. TiB for an array with shape (8388608, 8388608) and data type float64