@@ -123,16 +123,6 @@ class BaseTask(ABC):
123123 Number of threads to use for each process.
124124 logging_config (Optional[Dict]):
125125 Specifies configuration for logging, if None, it is loaded from the logging.yaml
126- ensemble_size (int: default=50):
127- Number of models added to the ensemble built by
128- Ensemble selection from libraries of models.
129- Models are drawn with replacement.
130- ensemble_nbest (int: default=50):
131- Only consider the ensemble_nbest models to build the ensemble
132- max_models_on_disc (int: default=50):
133- Maximum number of models saved to disc. It also controls the size of
134- the ensemble as any additional models will be deleted.
135- Must be greater than or equal to 1.
136126 temporary_directory (str):
137127 Folder to store configuration output and log file
138128 output_directory (str):
@@ -168,9 +158,6 @@ def __init__(
168158 n_jobs : int = 1 ,
169159 n_threads : int = 1 ,
170160 logging_config : Optional [Dict ] = None ,
171- ensemble_size : int = 50 ,
172- ensemble_nbest : int = 50 ,
173- max_models_on_disc : int = 50 ,
174161 temporary_directory : Optional [str ] = None ,
175162 output_directory : Optional [str ] = None ,
176163 delete_tmp_folder_after_terminate : bool = True ,
@@ -190,9 +177,6 @@ def __init__(
190177 self .seed = seed
191178 self .n_jobs = n_jobs
192179 self .n_threads = n_threads
193- self .ensemble_size = ensemble_size
194- self .ensemble_nbest = ensemble_nbest
195- self .max_models_on_disc = max_models_on_disc
196180 self .logging_config : Optional [Dict ] = logging_config
197181 self .include_components : Optional [Dict ] = include_components
198182 self .exclude_components : Optional [Dict ] = exclude_components
@@ -925,7 +909,10 @@ def _search(
925909 disable_file_output : Optional [List [Union [str , DisableFileOutputParameters ]]] = None ,
926910 load_models : bool = True ,
927911 portfolio_selection : Optional [str ] = None ,
928- dask_client : Optional [dask .distributed .Client ] = None
912+ dask_client : Optional [dask .distributed .Client ] = None ,
913+ ensemble_size : int = 50 ,
914+ ensemble_nbest : int = 50 ,
915+ max_models_on_disc : int = 50 ,
929916 ) -> 'BaseTask' :
930917 """
931918 Search for the best pipeline configuration for the given dataset.
@@ -1053,6 +1040,16 @@ def _search(
10531040 Additionally, the keyword 'greedy' is supported,
10541041 which would use the default portfolio from
10551042 `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
1043+ ensemble_size (int: default=50):
1044+ Number of models added to the ensemble built by
1045+ Ensemble selection from libraries of models.
1046+ Models are drawn with replacement.
1047+ ensemble_nbest (int: default=50):
1048+ Only consider the ensemble_nbest models to build the ensemble.
1049+ max_models_on_disc (int: default=50):
1050+ Maximum number of models saved to disc. It also controls the size of
1051+ the ensemble as any additional models will be deleted.
1052+ Must be greater than or equal to 1.
10561053
10571054 Returns:
10581055 self
@@ -1086,13 +1083,14 @@ def _search(
10861083 self ._disable_file_output = disable_file_output if disable_file_output is not None else []
10871084 if (
10881085 DisableFileOutputParameters .y_optimization in self ._disable_file_output
1089- and self . ensemble_size > 1
1086+ and ensemble_size > 1
10901087 ):
10911088 self ._logger .warning (f"No ensemble will be created when { DisableFileOutputParameters .y_optimization } "
10921089 f" is in disable_file_output" )
10931090
10941091 self ._memory_limit = memory_limit
10951092 self ._time_for_task = total_walltime_limit
1093+
10961094 # Save start time to backend
10971095 self ._backend .save_start_time (str (self .seed ))
10981096
@@ -1153,7 +1151,7 @@ def _search(
11531151
11541152 # Make sure that at least 2 models are created for the ensemble process
11551153 num_models = time_left_for_modelfit // func_eval_time_limit_secs
1156- if num_models < 2 and self . ensemble_size > 0 :
1154+ if num_models < 2 and ensemble_size > 0 :
11571155 func_eval_time_limit_secs = time_left_for_modelfit // 2
11581156 self ._logger .warning (
11591157 "Capping the func_eval_time_limit_secs to {} to have "
@@ -1164,7 +1162,7 @@ def _search(
11641162
11651163 # ============> Run dummy predictions
11661164 # We only want to run dummy predictions in case we want to build an ensemble
1167- if self . ensemble_size > 0 :
1165+ if ensemble_size > 0 :
11681166 dummy_task_name = 'runDummy'
11691167 self ._stopwatch .start_task (dummy_task_name )
11701168 self ._do_dummy_prediction ()
@@ -1173,7 +1171,7 @@ def _search(
11731171 # ============> Run traditional ml
11741172 # We only want to run traditional predictions in case we want to build an ensemble
11751173 # We want time for at least 1 Neural network in SMAC
1176- if enable_traditional_pipeline and self . ensemble_size > 0 :
1174+ if enable_traditional_pipeline and ensemble_size > 0 :
11771175 traditional_runtime_limit = int (self ._time_for_task - func_eval_time_limit_secs )
11781176 self .run_traditional_ml (current_task_name = self .dataset_name ,
11791177 runtime_limit = traditional_runtime_limit ,
@@ -1188,21 +1186,22 @@ def _search(
11881186 if time_left_for_ensembles <= 0 :
11891187 # An error is raised only when an ensemble was requested (ensemble_size > 0)
11901188 # but no time is left to build it (time_left_for_ensembles <= 0).
1191- if self . ensemble_size > 0 :
1189+ if ensemble_size > 0 :
11921190 raise ValueError ("Not starting ensemble builder because there "
11931191 "is no time left. Try increasing the value "
11941192 "of time_left_for_this_task." )
1195- elif self . ensemble_size <= 0 :
1193+ elif ensemble_size <= 0 :
11961194 self ._logger .info ("Not starting ensemble builder as ensemble size is 0" )
11971195 else :
11981196 self ._logger .info ("Starting ensemble" )
11991197 ensemble_task_name = 'ensemble'
12001198 self ._stopwatch .start_task (ensemble_task_name )
12011199 proc_ensemble = self ._init_ensemble_builder (time_left_for_ensembles = time_left_for_ensembles ,
1202- ensemble_size = self . ensemble_size ,
1203- ensemble_nbest = self . ensemble_nbest ,
1200+ ensemble_size = ensemble_size ,
1201+ ensemble_nbest = ensemble_nbest ,
12041202 precision = precision ,
1205- optimize_metric = self .opt_metric
1203+ optimize_metric = self .opt_metric ,
1204+ max_models_on_disc = max_models_on_disc
12061205 )
12071206 self ._stopwatch .stop_task (ensemble_task_name )
12081207
@@ -1662,6 +1661,7 @@ def fit_ensemble(
16621661 precision : Optional [int ] = None ,
16631662 ensemble_nbest : int = 50 ,
16641663 ensemble_size : int = 50 ,
1664+ max_models_on_disc : int = 50 ,
16651665 load_models : bool = True ,
16661666 time_for_task : int = 100 ,
16671667 func_eval_time_limit_secs : int = 50 ,
@@ -1677,13 +1677,16 @@ def fit_ensemble(
16771677 evaluate a pipeline. if not specified, value passed to search will be used
16781678 precision (Optional[int]): Numeric precision used when loading
16791679 ensemble data. Can be either 16, 32 or 64.
1680- ensemble_nbest (Optional[int]):
1681- only consider the ensemble_nbest models to build the ensemble.
1682- If None, uses the value stored in class attribute `ensemble_nbest`.
1683- ensemble_size (int) (default=50):
1680+ ensemble_size (int: default=50):
16841681 Number of models added to the ensemble built by
16851682 Ensemble selection from libraries of models.
16861683 Models are drawn with replacement.
1684+ ensemble_nbest (int: default=50):
1685+ Only consider the ensemble_nbest models to build the ensemble.
1686+ max_models_on_disc (int: default=50):
1687+ Maximum number of models saved to disc. It also controls the size of
1688+ the ensemble as any additional models will be deleted.
1689+ Must be greater than or equal to 1.
16871690 enable_traditional_pipeline (bool), (default=True):
16881691 We fit traditional machine learning algorithms
16891692 (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
@@ -1772,6 +1775,7 @@ def fit_ensemble(
17721775 precision = precision ,
17731776 ensemble_size = ensemble_size ,
17741777 ensemble_nbest = ensemble_nbest ,
1778+ max_models_on_disc = max_models_on_disc
17751779 )
17761780
17771781 manager .build_ensemble (self ._dask_client )
@@ -1793,6 +1797,7 @@ def _init_ensemble_builder(
17931797 optimize_metric : str ,
17941798 ensemble_nbest : int ,
17951799 ensemble_size : int ,
1800+ max_models_on_disc : int = 50 ,
17961801 precision : int = 32 ,
17971802 ) -> EnsembleBuilderManager :
17981803 """
@@ -1802,13 +1807,17 @@ def _init_ensemble_builder(
18021807 Time (in seconds) allocated to building the ensemble
18031808 optimize_metric (str):
18041809 Name of the metric to optimize the ensemble.
1805- ensemble_nbest (int):
1806- only consider the ensemble_nbest models to build the ensemble.
18071810 ensemble_size (int):
18081811 Number of models added to the ensemble built by
18091812 Ensemble selection from libraries of models.
18101813 Models are drawn with replacement.
1811- precision (int), (default=32): Numeric precision used when loading
1814+ ensemble_nbest (int):
1815+ Only consider the ensemble_nbest models to build the ensemble.
1816+ max_models_on_disc (int: default=50):
1817+ Maximum number of models saved to disc. It also controls the size of
1818+ the ensemble as any additional models will be deleted.
1819+ Must be greater than or equal to 1.
1820+ precision (int: default=32): Numeric precision used when loading
18121821 ensemble data. Can be either 16, 32 or 64.
18131822
18141823 Returns:
@@ -1842,7 +1851,7 @@ def _init_ensemble_builder(
18421851 opt_metric = optimize_metric ,
18431852 ensemble_size = ensemble_size ,
18441853 ensemble_nbest = ensemble_nbest ,
1845- max_models_on_disc = self . max_models_on_disc ,
1854+ max_models_on_disc = max_models_on_disc ,
18461855 seed = self .seed ,
18471856 max_iterations = None ,
18481857 read_at_most = sys .maxsize ,
0 commit comments