Skip to content

Commit 4d16352

Browse files
committed
move ensemble arguments to search function
1 parent 667e2c6 commit 4d16352

File tree

5 files changed

+80
-73
lines changed

5 files changed

+80
-73
lines changed

autoPyTorch/api/base_task.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -118,16 +118,6 @@ class BaseTask(ABC):
118118
Number of threads to use for each process.
119119
logging_config (Optional[Dict]):
120120
Specifies configuration for logging. If None, it is loaded from the logging.yaml
121-
ensemble_size (int: default=50):
122-
Number of models added to the ensemble built by
123-
Ensemble selection from libraries of models.
124-
Models are drawn with replacement.
125-
ensemble_nbest (int: default=50):
126-
Only consider the ensemble_nbest models to build the ensemble
127-
max_models_on_disc (int: default=50):
128-
Maximum number of models saved to disc. It also controls the size of
129-
the ensemble as any additional models will be deleted.
130-
Must be greater than or equal to 1.
131121
temporary_directory (str):
132122
Folder to store configuration output and log file
133123
output_directory (str):
@@ -156,9 +146,6 @@ def __init__(
156146
n_jobs: int = 1,
157147
n_threads: int = 1,
158148
logging_config: Optional[Dict] = None,
159-
ensemble_size: int = 50,
160-
ensemble_nbest: int = 50,
161-
max_models_on_disc: int = 50,
162149
temporary_directory: Optional[str] = None,
163150
output_directory: Optional[str] = None,
164151
delete_tmp_folder_after_terminate: bool = True,
@@ -174,9 +161,6 @@ def __init__(
174161
self.seed = seed
175162
self.n_jobs = n_jobs
176163
self.n_threads = n_threads
177-
self.ensemble_size = ensemble_size
178-
self.ensemble_nbest = ensemble_nbest
179-
self.max_models_on_disc = max_models_on_disc
180164
self.logging_config: Optional[Dict] = logging_config
181165
self.include_components: Optional[Dict] = include_components
182166
self.exclude_components: Optional[Dict] = exclude_components
@@ -909,7 +893,10 @@ def _search(
909893
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
910894
load_models: bool = True,
911895
portfolio_selection: Optional[str] = None,
912-
dask_client: Optional[dask.distributed.Client] = None
896+
dask_client: Optional[dask.distributed.Client] = None,
897+
ensemble_size: int = 50,
898+
ensemble_nbest: int = 50,
899+
max_models_on_disc: int = 50,
913900
) -> 'BaseTask':
914901
"""
915902
Search for the best pipeline configuration for the given dataset.
@@ -1037,6 +1024,16 @@ def _search(
10371024
Additionally, the keyword 'greedy' is supported,
10381025
which would use the default portfolio from
10391026
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
1027+
ensemble_size (int: default=50):
1028+
Number of models added to the ensemble built by
1029+
Ensemble selection from libraries of models.
1030+
Models are drawn with replacement.
1031+
ensemble_nbest (int: default=50):
1032+
Only consider the ensemble_nbest models to build the ensemble.
1033+
max_models_on_disc (int: default=50):
1034+
Maximum number of models saved to disc. It also controls the size of
1035+
the ensemble as any additional models will be deleted.
1036+
Must be greater than or equal to 1.
10401037
10411038
Returns:
10421039
self
@@ -1070,13 +1067,14 @@ def _search(
10701067
self._disable_file_output = disable_file_output if disable_file_output is not None else []
10711068
if (
10721069
DisableFileOutputParameters.y_optimization in self._disable_file_output
1073-
and self.ensemble_size > 1
1070+
and ensemble_size > 1
10741071
):
10751072
self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}"
10761073
f" is in disable_file_output")
10771074

10781075
self._memory_limit = memory_limit
10791076
self._time_for_task = total_walltime_limit
1077+
10801078
# Save start time to backend
10811079
self._backend.save_start_time(str(self.seed))
10821080

@@ -1137,7 +1135,7 @@ def _search(
11371135

11381136
# Make sure that at least 2 models are created for the ensemble process
11391137
num_models = time_left_for_modelfit // func_eval_time_limit_secs
1140-
if num_models < 2 and self.ensemble_size > 0:
1138+
if num_models < 2 and ensemble_size > 0:
11411139
func_eval_time_limit_secs = time_left_for_modelfit // 2
11421140
self._logger.warning(
11431141
"Capping the func_eval_time_limit_secs to {} to have "
@@ -1148,7 +1146,7 @@ def _search(
11481146

11491147
# ============> Run dummy predictions
11501148
# We only want to run dummy predictions in case we want to build an ensemble
1151-
if self.ensemble_size > 0:
1149+
if ensemble_size > 0:
11521150
dummy_task_name = 'runDummy'
11531151
self._stopwatch.start_task(dummy_task_name)
11541152
self._do_dummy_prediction()
@@ -1157,7 +1155,7 @@ def _search(
11571155
# ============> Run traditional ml
11581156
# We only want to run traditional predictions in case we want to build an ensemble
11591157
# We want time for at least 1 Neural network in SMAC
1160-
if enable_traditional_pipeline and self.ensemble_size > 0:
1158+
if enable_traditional_pipeline and ensemble_size > 0:
11611159
traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs)
11621160
self.run_traditional_ml(current_task_name=self.dataset_name,
11631161
runtime_limit=traditional_runtime_limit,
@@ -1172,21 +1170,22 @@ def _search(
11721170
if time_left_for_ensembles <= 0:
11731171
# Fit only raises error when ensemble_size is not zero but
11741172
# time_left_for_ensembles is zero or negative.
1175-
if self.ensemble_size > 0:
1173+
if ensemble_size > 0:
11761174
raise ValueError("Not starting ensemble builder because there "
11771175
"is no time left. Try increasing the value "
11781176
"of time_left_for_this_task.")
1179-
elif self.ensemble_size <= 0:
1177+
elif ensemble_size <= 0:
11801178
self._logger.info("Not starting ensemble builder as ensemble size is 0")
11811179
else:
11821180
self._logger.info("Starting ensemble")
11831181
ensemble_task_name = 'ensemble'
11841182
self._stopwatch.start_task(ensemble_task_name)
11851183
proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles,
1186-
ensemble_size=self.ensemble_size,
1187-
ensemble_nbest=self.ensemble_nbest,
1184+
ensemble_size=ensemble_size,
1185+
ensemble_nbest=ensemble_nbest,
11881186
precision=precision,
1189-
optimize_metric=self.opt_metric
1187+
optimize_metric=self.opt_metric,
1188+
max_models_on_disc=max_models_on_disc
11901189
)
11911190
self._stopwatch.stop_task(ensemble_task_name)
11921191

@@ -1646,6 +1645,7 @@ def fit_ensemble(
16461645
precision: Optional[int] = None,
16471646
ensemble_nbest: int = 50,
16481647
ensemble_size: int = 50,
1648+
max_models_on_disc: int = 50,
16491649
load_models: bool = True,
16501650
time_for_task: int = 100,
16511651
func_eval_time_limit_secs: int = 50,
@@ -1661,13 +1661,16 @@ def fit_ensemble(
16611661
evaluate a pipeline. If not specified, the value passed to search will be used
16621662
precision (Optional[int]): Numeric precision used when loading
16631663
ensemble data. Can be either 16, 32 or 64.
1664-
ensemble_nbest (Optional[int]):
1665-
only consider the ensemble_nbest models to build the ensemble.
1666-
If None, uses the value stored in class attribute `ensemble_nbest`.
1667-
ensemble_size (int) (default=50):
1664+
ensemble_size (int: default=50):
16681665
Number of models added to the ensemble built by
16691666
Ensemble selection from libraries of models.
16701667
Models are drawn with replacement.
1668+
ensemble_nbest (int: default=50):
1669+
Only consider the ensemble_nbest models to build the ensemble.
1670+
max_models_on_disc (int: default=50):
1671+
Maximum number of models saved to disc. It also controls the size of
1672+
the ensemble as any additional models will be deleted.
1673+
Must be greater than or equal to 1.
16711674
enable_traditional_pipeline (bool), (default=True):
16721675
We fit traditional machine learning algorithms
16731676
(LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
@@ -1756,6 +1759,7 @@ def fit_ensemble(
17561759
precision=precision,
17571760
ensemble_size=ensemble_size,
17581761
ensemble_nbest=ensemble_nbest,
1762+
max_models_on_disc=max_models_on_disc
17591763
)
17601764

17611765
manager.build_ensemble(self._dask_client)
@@ -1777,6 +1781,7 @@ def _init_ensemble_builder(
17771781
optimize_metric: str,
17781782
ensemble_nbest: int,
17791783
ensemble_size: int,
1784+
max_models_on_disc: int = 50,
17801785
precision: int = 32,
17811786
) -> EnsembleBuilderManager:
17821787
"""
@@ -1786,13 +1791,17 @@ def _init_ensemble_builder(
17861791
Time (in seconds) allocated to building the ensemble
17871792
optimize_metric (str):
17881793
Name of the metric to optimize the ensemble.
1789-
ensemble_nbest (int):
1790-
only consider the ensemble_nbest models to build the ensemble.
17911794
ensemble_size (int):
17921795
Number of models added to the ensemble built by
17931796
Ensemble selection from libraries of models.
17941797
Models are drawn with replacement.
1795-
precision (int), (default=32): Numeric precision used when loading
1798+
ensemble_nbest (int):
1799+
Only consider the ensemble_nbest models to build the ensemble.
1800+
max_models_on_disc (int: default=50):
1801+
Maximum number of models saved to disc. It also controls the size of
1802+
the ensemble as any additional models will be deleted.
1803+
Must be greater than or equal to 1.
1804+
precision (int: default=32): Numeric precision used when loading
17961805
ensemble data. Can be either 16, 32 or 64.
17971806
17981807
Returns:
@@ -1826,7 +1835,7 @@ def _init_ensemble_builder(
18261835
opt_metric=optimize_metric,
18271836
ensemble_size=ensemble_size,
18281837
ensemble_nbest=ensemble_nbest,
1829-
max_models_on_disc=self.max_models_on_disc,
1838+
max_models_on_disc=max_models_on_disc,
18301839
seed=self.seed,
18311840
max_iterations=None,
18321841
read_at_most=sys.maxsize,

autoPyTorch/api/tabular_classification.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,6 @@ class TabularClassificationTask(BaseTask):
3535
number of threads to use for each process.
3636
logging_config (Optional[Dict]):
3737
Specifies configuration for logging. If None, it is loaded from the logging.yaml
38-
ensemble_size (int: default=50):
39-
Number of models added to the ensemble built by
40-
Ensemble selection from libraries of models.
41-
Models are drawn with replacement.
42-
ensemble_nbest (int: default=50):
43-
Only consider the ensemble_nbest
44-
models to build the ensemble
45-
max_models_on_disc (int: default=50):
46-
Maximum number of models saved to disc.
47-
Also, controls the size of the ensemble
48-
as any additional models will be deleted.
49-
Must be greater than or equal to 1.
5038
temporary_directory (str):
5139
Folder to store configuration output and log file
5240
output_directory (str):
@@ -74,9 +62,6 @@ def __init__(
7462
n_jobs: int = 1,
7563
n_threads: int = 1,
7664
logging_config: Optional[Dict] = None,
77-
ensemble_size: int = 50,
78-
ensemble_nbest: int = 50,
79-
max_models_on_disc: int = 50,
8065
temporary_directory: Optional[str] = None,
8166
output_directory: Optional[str] = None,
8267
delete_tmp_folder_after_terminate: bool = True,
@@ -93,9 +78,6 @@ def __init__(
9378
n_jobs=n_jobs,
9479
n_threads=n_threads,
9580
logging_config=logging_config,
96-
ensemble_size=ensemble_size,
97-
ensemble_nbest=ensemble_nbest,
98-
max_models_on_disc=max_models_on_disc,
9981
temporary_directory=temporary_directory,
10082
output_directory=output_directory,
10183
delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -235,6 +217,9 @@ def search(
235217
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
236218
load_models: bool = True,
237219
portfolio_selection: Optional[str] = None,
220+
ensemble_size: int = 50,
221+
ensemble_nbest: int = 50,
222+
max_models_on_disc: int = 50,
238223
) -> 'BaseTask':
239224
"""
240225
Search for the best pipeline configuration for the given dataset.
@@ -361,6 +346,18 @@ def search(
361346
Additionally, the keyword 'greedy' is supported,
362347
which would use the default portfolio from
363348
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
349+
ensemble_size (int: default=50):
350+
Number of models added to the ensemble built by
351+
Ensemble selection from libraries of models.
352+
Models are drawn with replacement.
353+
ensemble_nbest (int: default=50):
354+
Only consider the ensemble_nbest
355+
models to build the ensemble.
356+
max_models_on_disc (int: default=50):
357+
Maximum number of models saved to disc.
358+
Also, controls the size of the ensemble
359+
as any additional models will be deleted.
360+
Must be greater than or equal to 1.
364361
365362
Returns:
366363
self
@@ -393,6 +390,9 @@ def search(
393390
disable_file_output=disable_file_output,
394391
load_models=load_models,
395392
portfolio_selection=portfolio_selection,
393+
ensemble_size=ensemble_size,
394+
ensemble_nbest=ensemble_nbest,
395+
max_models_on_disc=max_models_on_disc,
396396
)
397397

398398
def predict(

autoPyTorch/api/tabular_regression.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,6 @@ class TabularRegressionTask(BaseTask):
3535
number of threads to use for each process.
3636
logging_config (Optional[Dict]):
3737
Specifies configuration for logging. If None, it is loaded from the logging.yaml
38-
ensemble_size (int: default=50):
39-
Number of models added to the ensemble built by
40-
Ensemble selection from libraries of models.
41-
Models are drawn with replacement.
42-
ensemble_nbest (int: default=50):
43-
Only consider the ensemble_nbest
44-
models to build the ensemble
45-
max_models_on_disc (int: default=50):
46-
Maximum number of models saved to disc.
47-
Also, controls the size of the ensemble
48-
as any additional models will be deleted.
49-
Must be greater than or equal to 1.
5038
temporary_directory (str):
5139
Folder to store configuration output and log file
5240
output_directory (str):
@@ -75,9 +63,6 @@ def __init__(
7563
n_jobs: int = 1,
7664
n_threads: int = 1,
7765
logging_config: Optional[Dict] = None,
78-
ensemble_size: int = 50,
79-
ensemble_nbest: int = 50,
80-
max_models_on_disc: int = 50,
8166
temporary_directory: Optional[str] = None,
8267
output_directory: Optional[str] = None,
8368
delete_tmp_folder_after_terminate: bool = True,
@@ -94,9 +79,6 @@ def __init__(
9479
n_jobs=n_jobs,
9580
n_threads=n_threads,
9681
logging_config=logging_config,
97-
ensemble_size=ensemble_size,
98-
ensemble_nbest=ensemble_nbest,
99-
max_models_on_disc=max_models_on_disc,
10082
temporary_directory=temporary_directory,
10183
output_directory=output_directory,
10284
delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -236,6 +218,9 @@ def search(
236218
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
237219
load_models: bool = True,
238220
portfolio_selection: Optional[str] = None,
221+
ensemble_size: int = 50,
222+
ensemble_nbest: int = 50,
223+
max_models_on_disc: int = 50,
239224
) -> 'BaseTask':
240225
"""
241226
Search for the best pipeline configuration for the given dataset.
@@ -362,6 +347,18 @@ def search(
362347
Additionally, the keyword 'greedy' is supported,
363348
which would use the default portfolio from
364349
`AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
350+
ensemble_size (int: default=50):
351+
Number of models added to the ensemble built by
352+
Ensemble selection from libraries of models.
353+
Models are drawn with replacement.
354+
ensemble_nbest (int: default=50):
355+
Only consider the ensemble_nbest
356+
models to build the ensemble.
357+
max_models_on_disc (int: default=50):
358+
Maximum number of models saved to disc.
359+
Also, controls the size of the ensemble
360+
as any additional models will be deleted.
361+
Must be greater than or equal to 1.
365362
366363
Returns:
367364
self
@@ -393,6 +390,9 @@ def search(
393390
disable_file_output=disable_file_output,
394391
load_models=load_models,
395392
portfolio_selection=portfolio_selection,
393+
ensemble_size=ensemble_size,
394+
ensemble_nbest=ensemble_nbest,
395+
max_models_on_disc=max_models_on_disc,
396396
)
397397

398398
def predict(

test/test_api/test_api.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,6 @@ def test_do_traditional_pipeline(fit_dictionary_tabular):
578578
estimator = TabularClassificationTask(
579579
backend=backend,
580580
resampling_strategy=HoldoutValTypes.holdout_validation,
581-
ensemble_size=0,
582581
)
583582

584583
# Setup pre-requisites normally set by search()

test/test_api/test_base_api.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,6 @@ def test_init_ensemble_builder(backend):
174174
assert proc_ensemble.opt_metric == 'accuracy'
175175
assert proc_ensemble.metrics[0] == accuracy
176176

177-
estimator._close_dask_client()
178-
estimator._clean_logger()
177+
estimator._cleanup()
179178

180179
del estimator

0 commit comments

Comments
 (0)