Move ensemble arguments from the constructor to the search function

ravinkohli · ravinkohli · commit 23075d6933b5 · 2022-02-09T12:34:21.000+01:00
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
@@ -123,16 +123,6 @@ class BaseTask(ABC):
             Number of threads to use for each process.
         logging_config (Optional[Dict]):
             Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc. It also controls the size of
-            the ensemble as any additional models will be deleted.
-            Must be greater than or equal to 1.
         temporary_directory (str):
             Folder to store configuration output and log file
         output_directory (str):
@@ -168,9 +158,6 @@ def __init__(
         n_jobs: int = 1,
         n_threads: int = 1,
         logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
         temporary_directory: Optional[str] = None,
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
@@ -190,9 +177,6 @@ def __init__(
         self.seed = seed
         self.n_jobs = n_jobs
         self.n_threads = n_threads
-        self.ensemble_size = ensemble_size
-        self.ensemble_nbest = ensemble_nbest
-        self.max_models_on_disc = max_models_on_disc
         self.logging_config: Optional[Dict] = logging_config
         self.include_components: Optional[Dict] = include_components
         self.exclude_components: Optional[Dict] = exclude_components
@@ -925,7 +909,10 @@ def _search(
         disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
-        dask_client: Optional[dask.distributed.Client] = None
+        dask_client: Optional[dask.distributed.Client] = None,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -1053,6 +1040,16 @@ def _search(
                 Additionally, the keyword 'greedy' is supported,
                 which would use the default portfolio from
                 `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
 
         Returns:
             self
@@ -1086,13 +1083,14 @@ def _search(
         self._disable_file_output = disable_file_output if disable_file_output is not None else []
         if (
             DisableFileOutputParameters.y_optimization in self._disable_file_output
-            and self.ensemble_size > 1
+            and ensemble_size > 1
         ):
             self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}"
                                  f" is in disable_file_output")
 
         self._memory_limit = memory_limit
         self._time_for_task = total_walltime_limit
+
         # Save start time to backend
         self._backend.save_start_time(str(self.seed))
 
@@ -1153,7 +1151,7 @@ def _search(
 
         # Make sure that at least 2 models are created for the ensemble process
         num_models = time_left_for_modelfit // func_eval_time_limit_secs
-        if num_models < 2 and self.ensemble_size > 0:
+        if num_models < 2 and ensemble_size > 0:
             func_eval_time_limit_secs = time_left_for_modelfit // 2
             self._logger.warning(
                 "Capping the func_eval_time_limit_secs to {} to have "
@@ -1164,7 +1162,7 @@ def _search(
 
         # ============> Run dummy predictions
         # We only want to run dummy predictions in case we want to build an ensemble
-        if self.ensemble_size > 0:
+        if ensemble_size > 0:
             dummy_task_name = 'runDummy'
             self._stopwatch.start_task(dummy_task_name)
             self._do_dummy_prediction()
@@ -1173,7 +1171,7 @@ def _search(
         # ============> Run traditional ml
         # We only want to run traditional predictions in case we want to build an ensemble
         # We want time for at least 1 Neural network in SMAC
-        if enable_traditional_pipeline and self.ensemble_size > 0:
+        if enable_traditional_pipeline and ensemble_size > 0:
             traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs)
             self.run_traditional_ml(current_task_name=self.dataset_name,
                                     runtime_limit=traditional_runtime_limit,
@@ -1188,21 +1186,22 @@ def _search(
         if time_left_for_ensembles <= 0:
             # Fit only raises error when ensemble_size is not zero but
             # time_left_for_ensembles is zero.
-            if self.ensemble_size > 0:
+            if ensemble_size > 0:
                 raise ValueError("Not starting ensemble builder because there "
                                  "is no time left. Try increasing the value "
                                  "of time_left_for_this_task.")
-        elif self.ensemble_size <= 0:
+        elif ensemble_size <= 0:
             self._logger.info("Not starting ensemble builder as ensemble size is 0")
         else:
             self._logger.info("Starting ensemble")
             ensemble_task_name = 'ensemble'
             self._stopwatch.start_task(ensemble_task_name)
             proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles,
-                                                        ensemble_size=self.ensemble_size,
-                                                        ensemble_nbest=self.ensemble_nbest,
+                                                        ensemble_size=ensemble_size,
+                                                        ensemble_nbest=ensemble_nbest,
                                                         precision=precision,
-                                                        optimize_metric=self.opt_metric
+                                                        optimize_metric=self.opt_metric,
+                                                        max_models_on_disc=max_models_on_disc
                                                         )
             self._stopwatch.stop_task(ensemble_task_name)
 
@@ -1662,6 +1661,7 @@ def fit_ensemble(
         precision: Optional[int] = None,
         ensemble_nbest: int = 50,
         ensemble_size: int = 50,
+        max_models_on_disc: int = 50,
         load_models: bool = True,
         time_for_task: int = 100,
         func_eval_time_limit_secs: int = 50,
@@ -1677,13 +1677,16 @@ def fit_ensemble(
                 evaluate a pipeline. if not specified, value passed to search will be used
             precision (Optional[int]): Numeric precision used when loading
                 ensemble data. Can be either 16, 32 or 64.
-            ensemble_nbest (Optional[int]):
-                only consider the ensemble_nbest models to build the ensemble.
-                If None, uses the value stored in class attribute `ensemble_nbest`.
-            ensemble_size (int) (default=50):
+            ensemble_size (int: default=50):
                 Number of models added to the ensemble built by
                 Ensemble selection from libraries of models.
                 Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
             enable_traditional_pipeline (bool), (default=True):
                 We fit traditional machine learning algorithms
                 (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM)
@@ -1772,6 +1775,7 @@ def fit_ensemble(
             precision=precision,
             ensemble_size=ensemble_size,
             ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc
         )
 
         manager.build_ensemble(self._dask_client)
@@ -1793,6 +1797,7 @@ def _init_ensemble_builder(
         optimize_metric: str,
         ensemble_nbest: int,
         ensemble_size: int,
+        max_models_on_disc: int = 50,
         precision: int = 32,
     ) -> EnsembleBuilderManager:
         """
@@ -1802,13 +1807,17 @@ def _init_ensemble_builder(
                 Time (in seconds) allocated to building the ensemble
             optimize_metric (str):
                 Name of the metric to optimize the ensemble.
-            ensemble_nbest (int):
-                only consider the ensemble_nbest models to build the ensemble.
             ensemble_size (int):
                 Number of models added to the ensemble built by
                 Ensemble selection from libraries of models.
                 Models are drawn with replacement.
-            precision (int), (default=32): Numeric precision used when loading
+            ensemble_nbest (int):
+                Only consider the ensemble_nbest models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc. It also controls the size of
+                the ensemble as any additional models will be deleted.
+                Must be greater than or equal to 1.
+            precision (int: default=32): Numeric precision used when loading
                 ensemble data. Can be either 16, 32 or 64.
 
         Returns:
@@ -1842,7 +1851,7 @@ def _init_ensemble_builder(
             opt_metric=optimize_metric,
             ensemble_size=ensemble_size,
             ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=self.max_models_on_disc,
+            max_models_on_disc=max_models_on_disc,
             seed=self.seed,
             max_iterations=None,
             read_at_most=sys.maxsize,
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
@@ -35,18 +35,6 @@ class TabularClassificationTask(BaseTask):
             number of threads to use for each process.
         logging_config (Optional[Dict]):
             Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest
-            models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc.
-            Also, controls the size of the ensemble
-            as any additional models will be deleted.
-            Must be greater than or equal to 1.
         temporary_directory (str):
             Folder to store configuration output and log file
         output_directory (str):
@@ -81,9 +69,6 @@ def __init__(
         n_jobs: int = 1,
         n_threads: int = 1,
         logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
         temporary_directory: Optional[str] = None,
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
@@ -100,9 +85,6 @@ def __init__(
             n_jobs=n_jobs,
             n_threads=n_threads,
             logging_config=logging_config,
-            ensemble_size=ensemble_size,
-            ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=max_models_on_disc,
             temporary_directory=temporary_directory,
             output_directory=output_directory,
             delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -242,6 +224,9 @@ def search(
         disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -368,6 +353,18 @@ def search(
                 Additionally, the keyword 'greedy' is supported,
                 which would use the default portfolio from
                 `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest
+                models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc.
+                Also, controls the size of the ensemble
+                as any additional models will be deleted.
+                Must be greater than or equal to 1.
 
         Returns:
             self
@@ -400,6 +397,9 @@ def search(
             disable_file_output=disable_file_output,
             load_models=load_models,
             portfolio_selection=portfolio_selection,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc,
         )
 
     def predict(
diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
@@ -35,18 +35,6 @@ class TabularRegressionTask(BaseTask):
             number of threads to use for each process.
         logging_config (Optional[Dict]):
             Specifies configuration for logging, if None, it is loaded from the logging.yaml
-        ensemble_size (int: default=50):
-            Number of models added to the ensemble built by
-            Ensemble selection from libraries of models.
-            Models are drawn with replacement.
-        ensemble_nbest (int: default=50):
-            Only consider the ensemble_nbest
-            models to build the ensemble
-        max_models_on_disc (int: default=50):
-            Maximum number of models saved to disc.
-            Also, controls the size of the ensemble
-            as any additional models will be deleted.
-            Must be greater than or equal to 1.
         temporary_directory (str):
             Folder to store configuration output and log file
         output_directory (str):
@@ -82,9 +70,6 @@ def __init__(
         n_jobs: int = 1,
         n_threads: int = 1,
         logging_config: Optional[Dict] = None,
-        ensemble_size: int = 50,
-        ensemble_nbest: int = 50,
-        max_models_on_disc: int = 50,
         temporary_directory: Optional[str] = None,
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
@@ -101,9 +86,6 @@ def __init__(
             n_jobs=n_jobs,
             n_threads=n_threads,
             logging_config=logging_config,
-            ensemble_size=ensemble_size,
-            ensemble_nbest=ensemble_nbest,
-            max_models_on_disc=max_models_on_disc,
             temporary_directory=temporary_directory,
             output_directory=output_directory,
             delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
@@ -243,6 +225,9 @@ def search(
         disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
         load_models: bool = True,
         portfolio_selection: Optional[str] = None,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -369,6 +354,18 @@ def search(
                 Additionally, the keyword 'greedy' is supported,
                 which would use the default portfolio from
                 `AutoPyTorch Tabular <https://arxiv.org/abs/2006.13799>`_.
+            ensemble_size (int: default=50):
+                Number of models added to the ensemble built by
+                Ensemble selection from libraries of models.
+                Models are drawn with replacement.
+            ensemble_nbest (int: default=50):
+                Only consider the ensemble_nbest
+                models to build the ensemble
+            max_models_on_disc (int: default=50):
+                Maximum number of models saved to disc.
+                Also, controls the size of the ensemble
+                as any additional models will be deleted.
+                Must be greater than or equal to 1.
 
         Returns:
             self
@@ -400,6 +397,9 @@ def search(
             disable_file_output=disable_file_output,
             load_models=load_models,
             portfolio_selection=portfolio_selection,
+            ensemble_size=ensemble_size,
+            ensemble_nbest=ensemble_nbest,
+            max_models_on_disc=max_models_on_disc,
         )
 
     def predict(
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
@@ -579,7 +579,6 @@ def test_do_traditional_pipeline(fit_dictionary_tabular):
     estimator = TabularClassificationTask(
         backend=backend,
         resampling_strategy=HoldoutValTypes.holdout_validation,
-        ensemble_size=0,
     )
 
     # Setup pre-requisites normally set by search()
diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py
@@ -189,7 +189,6 @@ def test_init_ensemble_builder(backend):
     assert proc_ensemble.opt_metric == 'accuracy'
     assert proc_ensemble.metrics[0] == accuracy
 
-    estimator._close_dask_client()
-    estimator._clean_logger()
+    estimator._cleanup()
 
-    del estimator
+    del estimator

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -579,7 +579,6 @@ def test_do_traditional_pipeline(fit_dictionary_tabular):` |
| `579` | `579` | `estimator = TabularClassificationTask(` |
| `580` | `580` | `backend=backend,` |
| `581` | `581` | `resampling_strategy=HoldoutValTypes.holdout_validation,` |
| `582` |  | `- ensemble_size=0,` |
| `583` | `582` | `)` |
| `584` | `583` |  |
| `585` | `584` | `# Setup pre-requisites normally set by search()` |