Skip to content

Commit 23075d6

Browse files
committed
move ensemble arguments to search function
1 parent fdac86c commit 23075d6

File tree

5 files changed

+81
-74
lines changed

5 files changed

+81
-74
lines changed

autoPyTorch/api/base_task.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -123,16 +123,6 @@ class BaseTask(ABC):
123123
Number of threads to use for each process.
124124
logging_config (Optional[Dict]):
125125
Specifies configuration for logging. If None, it is loaded from the logging.yaml
126-
ensemble_size (int: default=50):
127-
Number of models added to the ensemble built by
128-
Ensemble selection from libraries of models.
129-
Models are drawn with replacement.
130-
ensemble_nbest (int: default=50):
131-
Only consider the ensemble_nbest models to build the ensemble
132-
max_models_on_disc (int: default=50):
133-
Maximum number of models saved to disc. It also controls the size of
134-
the ensemble as any additional models will be deleted.
135-
Must be greater than or equal to 1.
136126
temporary_directory (str):
137127
Folder to store configuration output and log file
138128
output_directory (str):
@@ -168,9 +158,6 @@ def __init__(
168158
n_jobs: int = 1,
169159
n_threads: int = 1,
170160
logging_config: Optional[Dict] = None,
171-
ensemble_size: int = 50,
172-
ensemble_nbest: int = 50,
173-
max_models_on_disc: int = 50,
174161
temporary_directory: Optional[str] = None,
175162
output_directory: Optional[str] = None,
176163
delete_tmp_folder_after_terminate: bool = True,
@@ -190,9 +177,6 @@ def __init__(
190177
self.seed = seed
191178
self.n_jobs = n_jobs
192179
self.n_threads = n_threads
193-
self.ensemble_size = ensemble_size
194-
self.ensemble_nbest = ensemble_nbest
195-
self.max_models_on_disc = max_models_on_disc
196180
self.logging_config: Optional[Dict] = logging_config
197181
self.include_components: Optional[Dict] = include_components
198182
self.exclude_components: Optional[Dict] = exclude_components
@@ -925,7 +909,10 @@ def _search(
925909
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
926910
load_models: bool = True,
927911
portfolio_selection: Optional[str] = None,
928-
dask_client: Optional[dask.distributed.Client] = None
912+
dask_client: Optional[dask.distributed.Client] = None,
913+
ensemble_size: int = 50,
914+
ensemble_nbest: int = 50,
915+
max_models_on_disc: int = 50,
929916
) -> 'BaseTask':
930917
"""
931918
Search for the best pipeline configuration for the given dataset.
@@ -1053,6 +1040,16 @@ def _search(
10531040
Additionally, the keyword 'greedy' is supported,
10541041
which would use the default portfolio from
10551042
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
1043+
ensemble_size (int: default=50):
1044+
Number of models added to the ensemble built by
1045+
Ensemble selection from libraries of models.
1046+
Models are drawn with replacement.
1047+
ensemble_nbest (int: default=50):
1048+
Only consider the ensemble_nbest models to build the ensemble
1049+
max_models_on_disc (int: default=50):
1050+
Maximum number of models saved to disc. It also controls the size of
1051+
the ensemble as any additional models will be deleted.
1052+
Must be greater than or equal to 1.
10561053
10571054
Returns:
10581055
self
@@ -1086,13 +1083,14 @@ def _search(
10861083
self._disable_file_output = disable_file_output if disable_file_output is not None else []
10871084
if (
10881085
DisableFileOutputParameters.y_optimization in self._disable_file_output
1089-
and self.ensemble_size > 1
1086+
and ensemble_size > 1
10901087
):
10911088
self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}"
10921089
f" is in disable_file_output")
10931090

10941091
self._memory_limit = memory_limit
10951092
self._time_for_task = total_walltime_limit
1093+
10961094
# Save start time to backend
10971095
self._backend.save_start_time(str(self.seed))
10981096

@@ -1153,7 +1151,7 @@ def _search(
11531151

11541152
# Make sure that at least 2 models are created for the ensemble process
11551153
num_models = time_left_for_modelfit // func_eval_time_limit_secs
1156-
if num_models < 2 and self.ensemble_size > 0:
1154+
if num_models < 2 and ensemble_size > 0:
11571155
func_eval_time_limit_secs = time_left_for_modelfit // 2
11581156
self._logger.warning(
11591157
"Capping the func_eval_time_limit_secs to {} to have "
@@ -1164,7 +1162,7 @@ def _search(
11641162

11651163
# ============> Run dummy predictions
11661164
# We only want to run dummy predictions in case we want to build an ensemble
1167-
if self.ensemble_size > 0:
1165+
if ensemble_size > 0:
11681166
dummy_task_name = 'runDummy'
11691167
self._stopwatch.start_task(dummy_task_name)
11701168
self._do_dummy_prediction()
@@ -1173,7 +1171,7 @@ def _search(
11731171
# ============> Run traditional ml
11741172
# We only want to run traditional predictions in case we want to build an ensemble
11751173
# We want time for at least 1 Neural network in SMAC
1176-
if enable_traditional_pipeline and self.ensemble_size > 0:
1174+
if enable_traditional_pipeline and ensemble_size > 0:
11771175
traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs)
11781176
self.run_traditional_ml(current_task_name=self.dataset_name,
11791177
runtime_limit=traditional_runtime_limit,
@@ -1188,21 +1186,22 @@ def _search(
11881186
if time_left_for_ensembles <= 0:
11891187
# Fit only raises error when ensemble_size is not zero but
11901188
# time_left_for_ensembles is zero.
1191-
if self.ensemble_size > 0:
1189+
if ensemble_size > 0:
11921190
raise ValueError("Not starting ensemble builder because there "
11931191
"is no time left. Try increasing the value "
11941192
"of time_left_for_this_task.")
1195-
elif self.ensemble_size <= 0:
1193+
elif ensemble_size <= 0:
11961194
self._logger.info("Not starting ensemble builder as ensemble size is 0")
11971195
else:
11981196
self._logger.info("Starting ensemble")
11991197
ensemble_task_name = 'ensemble'
12001198
self._stopwatch.start_task(ensemble_task_name)
12011199
proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles,
1202-
ensemble_size=self.ensemble_size,
1203-
ensemble_nbest=self.ensemble_nbest,
1200+
ensemble_size=ensemble_size,
1201+
ensemble_nbest=ensemble_nbest,
12041202
precision=precision,
1205-
optimize_metric=self.opt_metric
1203+
optimize_metric=self.opt_metric,
1204+
max_models_on_disc=max_models_on_disc
12061205
)
12071206
self._stopwatch.stop_task(ensemble_task_name)
12081207

@@ -1662,6 +1661,7 @@ def fit_ensemble(
16621661
precision: Optional[int] = None,
16631662
ensemble_nbest: int = 50,
16641663
ensemble_size: int = 50,
1664+
max_models_on_disc: int = 50,
16651665
load_models: bool = True,
16661666
time_for_task: int = 100,
16671667
func_eval_time_limit_secs: int = 50,
@@ -1677,13 +1677,16 @@ def fit_ensemble(
16771677
evaluate a pipeline. If not specified, the value passed to search will be used
16781678
precision (Optional[int]): Numeric precision used when loading
16791679
ensemble data. Can be either 16, 32 or 64.
1680-
ensemble_nbest (Optional[int]):
1681-
only consider the ensemble_nbest models to build the ensemble.
1682-
If None, uses the value stored in class attribute `ensemble_nbest`.
1683-
ensemble_size (int) (default=50):
1680+
ensemble_size (int: default=50):
16841681
Number of models added to the ensemble built by
16851682
Ensemble selection from libraries of models.
16861683
Models are drawn with replacement.
1684+
ensemble_nbest (int: default=50):
1685+
Only consider the ensemble_nbest models to build the ensemble
1686+
max_models_on_disc (int: default=50):
1687+
Maximum number of models saved to disc. It also controls the size of
1688+
the ensemble as any additional models will be deleted.
1689+
Must be greater than or equal to 1.
16871690
enable_traditional_pipeline (bool), (default=True):
16881691
We fit traditional machine learning algorithms
16891692
(LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
@@ -1772,6 +1775,7 @@ def fit_ensemble(
17721775
precision=precision,
17731776
ensemble_size=ensemble_size,
17741777
ensemble_nbest=ensemble_nbest,
1778+
max_models_on_disc=max_models_on_disc
17751779
)
17761780

17771781
manager.build_ensemble(self._dask_client)
@@ -1793,6 +1797,7 @@ def _init_ensemble_builder(
17931797
optimize_metric: str,
17941798
ensemble_nbest: int,
17951799
ensemble_size: int,
1800+
max_models_on_disc: int = 50,
17961801
precision: int = 32,
17971802
) -> EnsembleBuilderManager:
17981803
"""
@@ -1802,13 +1807,17 @@ def _init_ensemble_builder(
18021807
Time (in seconds) allocated to building the ensemble
18031808
optimize_metric (str):
18041809
Name of the metric to optimize the ensemble.
1805-
ensemble_nbest (int):
1806-
only consider the ensemble_nbest models to build the ensemble.
18071810
ensemble_size (int):
18081811
Number of models added to the ensemble built by
18091812
Ensemble selection from libraries of models.
18101813
Models are drawn with replacement.
1811-
precision (int), (default=32): Numeric precision used when loading
1814+
ensemble_nbest (int):
1815+
Only consider the ensemble_nbest models to build the ensemble
1816+
max_models_on_disc (int: default=50):
1817+
Maximum number of models saved to disc. It also controls the size of
1818+
the ensemble as any additional models will be deleted.
1819+
Must be greater than or equal to 1.
1820+
precision (int: default=32): Numeric precision used when loading
18121821
ensemble data. Can be either 16, 32 or 64.
18131822
18141823
Returns:
@@ -1842,7 +1851,7 @@ def _init_ensemble_builder(
18421851
opt_metric=optimize_metric,
18431852
ensemble_size=ensemble_size,
18441853
ensemble_nbest=ensemble_nbest,
1845-
max_models_on_disc=self.max_models_on_disc,
1854+
max_models_on_disc=max_models_on_disc,
18461855
seed=self.seed,
18471856
max_iterations=None,
18481857
read_at_most=sys.maxsize,

autoPyTorch/api/tabular_classification.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,6 @@ class TabularClassificationTask(BaseTask):
3535
number of threads to use for each process.
3636
logging_config (Optional[Dict]):
3737
Specifies configuration for logging. If None, it is loaded from the logging.yaml
38-
ensemble_size (int: default=50):
39-
Number of models added to the ensemble built by
40-
Ensemble selection from libraries of models.
41-
Models are drawn with replacement.
42-
ensemble_nbest (int: default=50):
43-
Only consider the ensemble_nbest
44-
models to build the ensemble
45-
max_models_on_disc (int: default=50):
46-
Maximum number of models saved to disc.
47-
Also, controls the size of the ensemble
48-
as any additional models will be deleted.
49-
Must be greater than or equal to 1.
5038
temporary_directory (str):
5139
Folder to store configuration output and log file
5240
output_directory (str):
@@ -81,9 +69,6 @@ def __init__(
8169
n_jobs: int = 1,
8270
n_threads: int = 1,
8371
logging_config: Optional[Dict] = None,
84-
ensemble_size: int = 50,
85-
ensemble_nbest: int = 50,
86-
max_models_on_disc: int = 50,
8772
temporary_directory: Optional[str] = None,
8873
output_directory: Optional[str] = None,
8974
delete_tmp_folder_after_terminate: bool = True,
@@ -100,9 +85,6 @@ def __init__(
10085
n_jobs=n_jobs,
10186
n_threads=n_threads,
10287
logging_config=logging_config,
103-
ensemble_size=ensemble_size,
104-
ensemble_nbest=ensemble_nbest,
105-
max_models_on_disc=max_models_on_disc,
10688
temporary_directory=temporary_directory,
10789
output_directory=output_directory,
10890
delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -242,6 +224,9 @@ def search(
242224
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
243225
load_models: bool = True,
244226
portfolio_selection: Optional[str] = None,
227+
ensemble_size: int = 50,
228+
ensemble_nbest: int = 50,
229+
max_models_on_disc: int = 50,
245230
) -> 'BaseTask':
246231
"""
247232
Search for the best pipeline configuration for the given dataset.
@@ -368,6 +353,18 @@ def search(
368353
Additionally, the keyword 'greedy' is supported,
369354
which would use the default portfolio from
370355
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
356+
ensemble_size (int: default=50):
357+
Number of models added to the ensemble built by
358+
Ensemble selection from libraries of models.
359+
Models are drawn with replacement.
360+
ensemble_nbest (int: default=50):
361+
Only consider the ensemble_nbest
362+
models to build the ensemble
363+
max_models_on_disc (int: default=50):
364+
Maximum number of models saved to disc.
365+
Also, controls the size of the ensemble
366+
as any additional models will be deleted.
367+
Must be greater than or equal to 1.
371368
372369
Returns:
373370
self
@@ -400,6 +397,9 @@ def search(
400397
disable_file_output=disable_file_output,
401398
load_models=load_models,
402399
portfolio_selection=portfolio_selection,
400+
ensemble_size=ensemble_size,
401+
ensemble_nbest=ensemble_nbest,
402+
max_models_on_disc=max_models_on_disc,
403403
)
404404

405405
def predict(

autoPyTorch/api/tabular_regression.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,6 @@ class TabularRegressionTask(BaseTask):
3535
number of threads to use for each process.
3636
logging_config (Optional[Dict]):
3737
Specifies configuration for logging. If None, it is loaded from the logging.yaml
38-
ensemble_size (int: default=50):
39-
Number of models added to the ensemble built by
40-
Ensemble selection from libraries of models.
41-
Models are drawn with replacement.
42-
ensemble_nbest (int: default=50):
43-
Only consider the ensemble_nbest
44-
models to build the ensemble
45-
max_models_on_disc (int: default=50):
46-
Maximum number of models saved to disc.
47-
Also, controls the size of the ensemble
48-
as any additional models will be deleted.
49-
Must be greater than or equal to 1.
5038
temporary_directory (str):
5139
Folder to store configuration output and log file
5240
output_directory (str):
@@ -82,9 +70,6 @@ def __init__(
8270
n_jobs: int = 1,
8371
n_threads: int = 1,
8472
logging_config: Optional[Dict] = None,
85-
ensemble_size: int = 50,
86-
ensemble_nbest: int = 50,
87-
max_models_on_disc: int = 50,
8873
temporary_directory: Optional[str] = None,
8974
output_directory: Optional[str] = None,
9075
delete_tmp_folder_after_terminate: bool = True,
@@ -101,9 +86,6 @@ def __init__(
10186
n_jobs=n_jobs,
10287
n_threads=n_threads,
10388
logging_config=logging_config,
104-
ensemble_size=ensemble_size,
105-
ensemble_nbest=ensemble_nbest,
106-
max_models_on_disc=max_models_on_disc,
10789
temporary_directory=temporary_directory,
10890
output_directory=output_directory,
10991
delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -243,6 +225,9 @@ def search(
243225
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
244226
load_models: bool = True,
245227
portfolio_selection: Optional[str] = None,
228+
ensemble_size: int = 50,
229+
ensemble_nbest: int = 50,
230+
max_models_on_disc: int = 50,
246231
) -> 'BaseTask':
247232
"""
248233
Search for the best pipeline configuration for the given dataset.
@@ -369,6 +354,18 @@ def search(
369354
Additionally, the keyword 'greedy' is supported,
370355
which would use the default portfolio from
371356
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
357+
ensemble_size (int: default=50):
358+
Number of models added to the ensemble built by
359+
Ensemble selection from libraries of models.
360+
Models are drawn with replacement.
361+
ensemble_nbest (int: default=50):
362+
Only consider the ensemble_nbest
363+
models to build the ensemble
364+
max_models_on_disc (int: default=50):
365+
Maximum number of models saved to disc.
366+
Also, controls the size of the ensemble
367+
as any additional models will be deleted.
368+
Must be greater than or equal to 1.
372369
373370
Returns:
374371
self
@@ -400,6 +397,9 @@ def search(
400397
disable_file_output=disable_file_output,
401398
load_models=load_models,
402399
portfolio_selection=portfolio_selection,
400+
ensemble_size=ensemble_size,
401+
ensemble_nbest=ensemble_nbest,
402+
max_models_on_disc=max_models_on_disc,
403403
)
404404

405405
def predict(

test/test_api/test_api.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,6 @@ def test_do_traditional_pipeline(fit_dictionary_tabular):
579579
estimator = TabularClassificationTask(
580580
backend=backend,
581581
resampling_strategy=HoldoutValTypes.holdout_validation,
582-
ensemble_size=0,
583582
)
584583

585584
# Setup pre-requisites normally set by search()

test/test_api/test_base_api.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,6 @@ def test_init_ensemble_builder(backend):
189189
assert proc_ensemble.opt_metric == 'accuracy'
190190
assert proc_ensemble.metrics[0] == accuracy
191191

192-
estimator._close_dask_client()
193-
estimator._clean_logger()
192+
estimator._cleanup()
194193

195-
del estimator
194+
del estimator

0 commit comments

Comments
 (0)