
Commit fd71cb7

move ensemble arguments to search function
1 parent 20514cb commit fd71cb7

File tree

5 files changed

+87 -73 lines changed
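
In practical terms, the three ensemble options move from the task constructor to the search call. The sketch below illustrates the before/after usage through the tabular classification API; the dataset, metric, and time limits are placeholder choices and not taken from this commit.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X, y = make_classification(n_samples=200, n_features=10, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Before this commit, the ensemble options were constructor arguments:
#   TabularClassificationTask(ensemble_size=50, ensemble_nbest=50, max_models_on_disc=50)
# After it, the task is constructed without them ...
api = TabularClassificationTask()

# ... and they are passed per search() call instead.
api.search(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
    ensemble_size=50,
    ensemble_nbest=50,
    max_models_on_disc=50,
)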

autoPyTorch/api/base_task.py

Lines changed: 43 additions & 33 deletions
@@ -128,16 +128,6 @@ class BaseTask(ABC):
            Number of threads to use for each process.
        logging_config (Optional[Dict]):
            Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc. It also controls the size of
-            the ensemble as any additional models will be deleted.
-            Must be greater than or equal to 1.
        temporary_directory (str):
            Folder to store configuration output and log file
        output_directory (str):
@@ -173,9 +163,6 @@ def __init__(
        n_jobs: int = 1,
        n_threads: int = 1,
        logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
        temporary_directory: Optional[str] = None,
        output_directory: Optional[str] = None,
        delete_tmp_folder_after_terminate: bool = True,
@@ -195,9 +182,6 @@ def __init__(
        self.seed = seed
        self.n_jobs = n_jobs
        self.n_threads = n_threads
-        self.ensemble_size = ensemble_size
-        self.ensemble_nbest = ensemble_nbest
-        self.max_models_on_disc = max_models_on_disc
        self.logging_config: Optional[Dict] = logging_config
        self.include_components: Optional[Dict] = include_components
        self.exclude_components: Optional[Dict] = exclude_components
@@ -980,6 +964,9 @@ def _search(
        load_models: bool = True,
        portfolio_selection: Optional[str] = None,
        dask_client: Optional[dask.distributed.Client] = None,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
        **kwargs: Any
    ) -> 'BaseTask':
        """
@@ -1108,6 +1095,16 @@ def _search(
                Additionally, the keyword 'greedy' is supported,
                which would use the default portfolio from
                `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
            kwargs: Any
                additional arguments that are customed by some specific task.
                For instance, forecasting tasks require:
@@ -1116,6 +1113,7 @@ def _search(
                    hyperparameters are determined by the default configurations
                    custom_init_setting_path (str): The path to the initial hyperparameter configurations set by
                        the users
+
        Returns:
            self

@@ -1148,13 +1146,14 @@ def _search(
        self._disable_file_output = disable_file_output if disable_file_output is not None else []
        if (
            DisableFileOutputParameters.y_optimization in self._disable_file_output
-            and self.ensemble_size > 1
+            and ensemble_size > 1
        ):
            self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}"
                                 f" is in disable_file_output")

        self._memory_limit = memory_limit
        self._time_for_task = total_walltime_limit
+
        # Save start time to backend
        self._backend.save_start_time(str(self.seed))

@@ -1218,7 +1217,7 @@ def _search(

        # Make sure that at least 2 models are created for the ensemble process
        num_models = time_left_for_modelfit // func_eval_time_limit_secs
-        if num_models < 2 and self.ensemble_size > 0:
+        if num_models < 2 and ensemble_size > 0:
            func_eval_time_limit_secs = time_left_for_modelfit // 2
            self._logger.warning(
                "Capping the func_eval_time_limit_secs to {} to have "
@@ -1229,7 +1228,7 @@ def _search(

        # ============> Run dummy predictions
        # We only want to run dummy predictions in case we want to build an ensemble
-        if self.ensemble_size > 0:
+        if ensemble_size > 0:
            dummy_task_name = 'runDummy'
            self._stopwatch.start_task(dummy_task_name)
            self._do_dummy_prediction()
@@ -1238,7 +1237,7 @@ def _search(
        # ============> Run traditional ml
        # We only want to run traditional predictions in case we want to build an ensemble
        # We want time for at least 1 Neural network in SMAC
-        if enable_traditional_pipeline and self.ensemble_size > 0:
+        if enable_traditional_pipeline and ensemble_size > 0:
            traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs)
            self.run_traditional_ml(current_task_name=self.dataset_name,
                                    runtime_limit=traditional_runtime_limit,
@@ -1253,21 +1252,22 @@ def _search(
        if time_left_for_ensembles <= 0:
            # Fit only raises error when ensemble_size is not zero but
            # time_left_for_ensembles is zero.
-            if self.ensemble_size > 0:
+            if ensemble_size > 0:
                raise ValueError("Not starting ensemble builder because there "
                                 "is no time left. Try increasing the value "
                                 "of time_left_for_this_task.")
-        elif self.ensemble_size <= 0:
+        elif ensemble_size <= 0:
            self._logger.info("Not starting ensemble builder as ensemble size is 0")
        else:
            self._logger.info("Starting ensemble")
            ensemble_task_name = 'ensemble'
            self._stopwatch.start_task(ensemble_task_name)
            proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles,
-                                                        ensemble_size=self.ensemble_size,
-                                                        ensemble_nbest=self.ensemble_nbest,
+                                                        ensemble_size=ensemble_size,
+                                                        ensemble_nbest=ensemble_nbest,
                                                        precision=precision,
-                                                        optimize_metric=self.opt_metric
+                                                        optimize_metric=self.opt_metric,
+                                                        max_models_on_disc=max_models_on_disc
                                                        )
            self._stopwatch.stop_task(ensemble_task_name)

@@ -1740,6 +1740,7 @@ def fit_ensemble(
        precision: Optional[int] = None,
        ensemble_nbest: int = 50,
        ensemble_size: int = 50,
+        max_models_on_disc: int = 50,
        load_models: bool = True,
        time_for_task: int = 100,
        func_eval_time_limit_secs: int = 50,
@@ -1755,13 +1756,16 @@ def fit_ensemble(
                evaluate a pipeline. if not specified, value passed to search will be used
            precision (Optional[int]): Numeric precision used when loading
                ensemble data. Can be either 16, 32 or 64.
-            ensemble_nbest (Optional[int]):
-                only consider the ensemble_nbest models to build the ensemble.
-                If None, uses the value stored in class attribute `ensemble_nbest`.
-            ensemble_size (int) (default=50):
+            ensemble_size (int: default=50):
                Number of models added to the ensemble built by
                Ensemble selection from libraries of models.
                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
            enable_traditional_pipeline (bool), (default=True):
                We fit traditional machine learning algorithms
                (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
@@ -1850,6 +1854,7 @@ def fit_ensemble(
            precision=precision,
            ensemble_size=ensemble_size,
            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc
        )

        manager.build_ensemble(self._dask_client)
@@ -1871,6 +1876,7 @@ def _init_ensemble_builder(
        optimize_metric: str,
        ensemble_nbest: int,
        ensemble_size: int,
+        max_models_on_disc: int = 50,
        precision: int = 32,
    ) -> EnsembleBuilderManager:
        """
@@ -1880,13 +1886,17 @@ def _init_ensemble_builder(
                Time (in seconds) allocated to building the ensemble
            optimize_metric (str):
                Name of the metric to optimize the ensemble.
-            ensemble_nbest (int):
-                only consider the ensemble_nbest models to build the ensemble.
            ensemble_size (int):
                Number of models added to the ensemble built by
                Ensemble selection from libraries of models.
                Models are drawn with replacement.
-            precision (int), (default=32): Numeric precision used when loading
+            ensemble_nbest (int):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
+            precision (int: default=32): Numeric precision used when loading
                ensemble data. Can be either 16, 32 or 64.

        Returns:
@@ -1920,7 +1930,7 @@ def _init_ensemble_builder(
            opt_metric=optimize_metric,
            ensemble_size=ensemble_size,
            ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=self.max_models_on_disc,
+            max_models_on_disc=max_models_on_disc,
            seed=self.seed,
            max_iterations=None,
            read_at_most=sys.maxsize,
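
Since fit_ensemble() now accepts max_models_on_disc directly and forwards it to _init_ensemble_builder(), a post-search ensemble can be rebuilt with all three options in one call. A minimal sketch, assuming api is a task on which search() has already finished; the values are placeholders:

api.fit_ensemble(
    ensemble_size=20,        # models selected into the ensemble
    ensemble_nbest=10,       # candidate pool considered by ensemble selection
    max_models_on_disc=20,   # keyword newly accepted here by this commit
    precision=32,
    load_models=True,
)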

autoPyTorch/api/tabular_classification.py

Lines changed: 18 additions & 18 deletions
@@ -39,18 +39,6 @@ class TabularClassificationTask(BaseTask):
            number of threads to use for each process.
        logging_config (Optional[Dict]):
            Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest
-            models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc.
-            Also, controls the size of the ensemble
-            as any additional models will be deleted.
-            Must be greater than or equal to 1.
        temporary_directory (str):
            Folder to store configuration output and log file
        output_directory (str):
@@ -85,9 +73,6 @@ def __init__(
        n_jobs: int = 1,
        n_threads: int = 1,
        logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
        temporary_directory: Optional[str] = None,
        output_directory: Optional[str] = None,
        delete_tmp_folder_after_terminate: bool = True,
@@ -104,9 +89,6 @@ def __init__(
            n_jobs=n_jobs,
            n_threads=n_threads,
            logging_config=logging_config,
-            ensemble_size=ensemble_size,
-            ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=max_models_on_disc,
            temporary_directory=temporary_directory,
            output_directory=output_directory,
            delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -260,6 +242,9 @@ def search(
        load_models: bool = True,
        portfolio_selection: Optional[str] = None,
        dataset_compression: Union[Mapping[str, Any], bool] = False,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
    ) -> 'BaseTask':
        """
        Search for the best pipeline configuration for the given dataset.
@@ -429,6 +414,18 @@ def search(
                    Subsampling takes into account classification labels and stratifies
                    accordingly. We guarantee that at least one occurrence of each
                    label is included in the sampled set.
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest
+                models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc.
+                Also, controls the size of the ensemble
+                as any additional models will be deleted.
+                Must be greater than or equal to 1.

        Returns:
            self
@@ -464,6 +461,9 @@ def search(
            disable_file_output=disable_file_output,
            load_models=load_models,
            portfolio_selection=portfolio_selection,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc,
        )

    def predict(
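
One interaction worth noting: _search() now checks the ensemble_size argument rather than a stored attribute when warning that disabling y_optimization output prevents ensemble building. A hedged sketch of a call that would trigger that warning, reusing the data prepared in the first sketch:

api = TabularClassificationTask()
api.search(
    X_train=X_train, y_train=y_train,         # as prepared in the first sketch
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
    disable_file_output=['y_optimization'],   # optimization predictions are not written
    ensemble_size=50,                         # > 1 together with the disabled output logs the warning
)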

autoPyTorch/api/tabular_regression.py

Lines changed: 24 additions & 18 deletions
@@ -39,18 +39,6 @@ class TabularRegressionTask(BaseTask):
            number of threads to use for each process.
        logging_config (Optional[Dict]):
            Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest
-            models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc.
-            Also, controls the size of the ensemble
-            as any additional models will be deleted.
-            Must be greater than or equal to 1.
        temporary_directory (str):
            Folder to store configuration output and log file
        output_directory (str):
@@ -86,9 +74,6 @@ def __init__(
        n_jobs: int = 1,
        n_threads: int = 1,
        logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
        temporary_directory: Optional[str] = None,
        output_directory: Optional[str] = None,
        delete_tmp_folder_after_terminate: bool = True,
@@ -105,9 +90,6 @@ def __init__(
            n_jobs=n_jobs,
            n_threads=n_threads,
            logging_config=logging_config,
-            ensemble_size=ensemble_size,
-            ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=max_models_on_disc,
            temporary_directory=temporary_directory,
            output_directory=output_directory,
            delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -259,7 +241,13 @@ def search(
        disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
        load_models: bool = True,
        portfolio_selection: Optional[str] = None,
+<<<<<<< HEAD
        dataset_compression: Union[Mapping[str, Any], bool] = False,
+=======
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
+>>>>>>> move ensemble arguments to search function
    ) -> 'BaseTask':
        """
        Search for the best pipeline configuration for the given dataset.
@@ -390,6 +378,7 @@ def search(
                Additionally, the keyword 'greedy' is supported,
                which would use the default portfolio from
                `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
+<<<<<<< HEAD
            dataset_compression: Union[bool, Mapping[str, Any]] = True
                We compress datasets so that they fit into some predefined amount of memory.
                **NOTE**
@@ -429,6 +418,20 @@ def search(
                    Subsampling takes into account classification labels and stratifies
                    accordingly. We guarantee that at least one occurrence of each
                    label is included in the sampled set.
+=======
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest
+                models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc.
+                Also, controls the size of the ensemble
+                as any additional models will be deleted.
+                Must be greater than or equal to 1.
+>>>>>>> move ensemble arguments to search function

        Returns:
            self
@@ -465,6 +468,9 @@ def search(
            disable_file_output=disable_file_output,
            load_models=load_models,
            portfolio_selection=portfolio_selection,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc,
        )

    def predict(
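
Note that the regression hunks above commit literal merge conflict markers into the search() signature and docstring. A plausible resolution, mirroring what tabular_classification.py ends up with (keep dataset_compression and add the three ensemble keywords), is sketched below; this is an editorial sketch, not part of the commit itself:

from typing import Any, Mapping, Optional, Union

# Hypothetical resolved signature (editorial sketch, not the committed code):
def search(
    self,
    # ... the unchanged parameters shown in the hunk above ...
    load_models: bool = True,
    portfolio_selection: Optional[str] = None,
    dataset_compression: Union[Mapping[str, Any], bool] = False,  # kept from HEAD
    ensemble_size: int = 50,                                      # kept from this branch
    ensemble_nbest: int = 50,
    max_models_on_disc: int = 50,
) -> 'BaseTask':
    ...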

test/test_api/test_api.py

Lines changed: 0 additions & 1 deletion
@@ -759,7 +759,6 @@ def test_do_traditional_pipeline(fit_dictionary_tabular):
    estimator = TabularClassificationTask(
        backend=backend,
        resampling_strategy=HoldoutValTypes.holdout_validation,
-        ensemble_size=0,
    )

    # Setup pre-requisites normally set by search()
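
With the constructor argument gone, a test or user that wants no ensemble at all would presumably pass ensemble_size=0 to search() instead; per the _search() guards above, that skips the dummy predictions, the traditional-ML runs, and the ensemble builder. A sketch with placeholder limits, reusing the data prepared in the first sketch:

api = TabularClassificationTask()
api.search(
    X_train=X_train, y_train=y_train,   # as prepared in the first sketch
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
    ensemble_size=0,                    # no dummy run, no traditional ML, no ensemble builder
)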
