From bb4554f441cb31fadb848c4e1645bdea6a0b99bf Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Wed, 24 Dec 2025 00:18:32 +0530 Subject: [PATCH 1/3] improved the Getter API for users --- examples/Advanced/tasks_tutorial.py | 28 +++++-- examples/Basics/simple_datasets_tutorial.py | 12 ++- .../Basics/simple_flows_and_runs_tutorial.py | 15 +++- examples/Basics/simple_tasks_tutorial.py | 5 +- openml/__init__.py | 75 ++++++++++++++++++- tests/test_openml/test_openml.py | 24 ++++++ 6 files changed, 143 insertions(+), 16 deletions(-) diff --git a/examples/Advanced/tasks_tutorial.py b/examples/Advanced/tasks_tutorial.py index dff7293ad..1418aa91c 100644 --- a/examples/Advanced/tasks_tutorial.py +++ b/examples/Advanced/tasks_tutorial.py @@ -24,13 +24,15 @@ # # We will start by simply listing only *supervised classification* tasks. # -# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we -# request a +# **openml.list("task")** (or **openml.tasks.list_tasks()**) returns a dictionary of +# dictionaries by default, but we request a # [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) # instead to have better visualization capabilities and easier access: # %% -tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION) +tasks = openml.list("task", task_type=TaskType.SUPERVISED_CLASSIFICATION) +# Legacy path still works: +# tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION) print(tasks.columns) print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) @@ -66,7 +68,9 @@ # Similar to listing tasks by task type, we can list tasks by tags: # %% -tasks = openml.tasks.list_tasks(tag="OpenML100") +tasks = openml.list("task", tag="OpenML100") +# Legacy path still works: +# tasks = openml.tasks.list_tasks(tag="OpenML100") print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) @@ -74,7 +78,9 @@ # Furthermore, we can list tasks based on the dataset id: # %% -tasks = openml.tasks.list_tasks(data_id=1471) +tasks = openml.list("task", data_id=1471) +# Legacy path still works: +# tasks = openml.tasks.list_tasks(data_id=1471) print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) @@ -82,7 +88,9 @@ # In addition, a size limit and an offset can be applied both separately and simultaneously: # %% -tasks = openml.tasks.list_tasks(size=10, offset=50) +tasks = openml.list("task", size=10, offset=50) +# Legacy path still works: +# tasks = openml.tasks.list_tasks(size=10, offset=50) print(tasks) # %% [markdown] @@ -98,7 +106,9 @@ # Finally, it is also possible to list all tasks on OpenML with: # %% -tasks = openml.tasks.list_tasks() +tasks = openml.list("task") +# Legacy path still works: +# tasks = openml.tasks.list_tasks() print(len(tasks)) # %% [markdown] @@ -118,7 +128,9 @@ # %% task_id = 31 -task = openml.tasks.get_task(task_id) +task = openml.get("task", task_id) +# Legacy path still works: +# task = openml.tasks.get_task(task_id) # %% # Properties of the task are stored as member variables: diff --git a/examples/Basics/simple_datasets_tutorial.py b/examples/Basics/simple_datasets_tutorial.py index 75d36ed0f..6d90c22cb 100644 --- a/examples/Basics/simple_datasets_tutorial.py +++ b/examples/Basics/simple_datasets_tutorial.py @@ -14,15 +14,23 @@ # ## List datasets stored on OpenML # %% -datasets_df = openml.datasets.list_datasets() +datasets_df = openml.list("dataset") print(datasets_df.head(n=10)) +# Legacy path still works: +# datasets_df = openml.datasets.list_datasets() + # %% [markdown] # ## Download a dataset # %% # Iris dataset https://www.openml.org/d/61 -dataset = openml.datasets.get_dataset(dataset_id=61) +dataset = openml.get("dataset", 61) +# You can also fetch by name: +# dataset = openml.get("dataset", "Fashion-MNIST") + +# Legacy path still works: +# dataset = openml.datasets.get_dataset(dataset_id=61) # Print a summary print( diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py index 41eed9234..f99685f6d 100644 --- a/examples/Basics/simple_flows_and_runs_tutorial.py +++ b/examples/Basics/simple_flows_and_runs_tutorial.py @@ -24,12 +24,25 @@ # %% openml.config.start_using_configuration_for_example() +# %% [markdown] +# ## Quick: list flows and runs via unified entrypoints + +# %% +flows_df = openml.list("flow", size=3) +print(flows_df.head()) + +runs_df = openml.list("run", size=3) +print(runs_df.head()) + # %% [markdown] # ## Train a machine learning model and evaluate it # NOTE: We are using task 119 from the test server: https://test.openml.org/d/20 # %% -task = openml.tasks.get_task(119) +task = openml.get("task", 119) + +# Legacy path still works: +# task = openml.tasks.get_task(119) # Get the data dataset = task.get_dataset() diff --git a/examples/Basics/simple_tasks_tutorial.py b/examples/Basics/simple_tasks_tutorial.py index 598ce4e71..0989d3e1d 100644 --- a/examples/Basics/simple_tasks_tutorial.py +++ b/examples/Basics/simple_tasks_tutorial.py @@ -10,7 +10,10 @@ # [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): # %% -task = openml.tasks.get_task(31) +task = openml.get("task", 31) + +# Legacy path still works: +# task = openml.tasks.get_task(31) # %% [markdown] # Get the dataset and its data from the task. diff --git a/openml/__init__.py b/openml/__init__.py index c49505eb9..81aa7b44a 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -18,6 +18,9 @@ # License: BSD 3-Clause from __future__ import annotations +import builtins +from typing import Any, Callable, Dict + from . import ( _api_calls, config, @@ -49,12 +52,74 @@ OpenMLTask, ) +ListDispatcher = Dict[str, Callable[..., Any]] +GetDispatcher = Dict[str, Callable[..., Any]] + + +def list(object_type: str, /, **kwargs: Any) -> Any: # noqa: A001 + """List OpenML objects by type (e.g., datasets, tasks, flows, runs). + + This is a convenience dispatcher that forwards to the existing type-specific + ``list_*`` functions. Existing imports remain available for backward compatibility. + """ + dispatch: ListDispatcher = { + "dataset": datasets.functions.list_datasets, + "task": tasks.functions.list_tasks, + "flow": flows.functions.list_flows, + "run": runs.functions.list_runs, + } + + try: + func = dispatch[object_type.lower()] + except KeyError as exc: # pragma: no cover - defensive branch + raise ValueError( + "Unsupported object_type for list; expected one of 'dataset', 'task', 'flow', 'run'.", + ) from exc + + return func(**kwargs) + + +def get(object_type_or_name: Any, identifier: Any | None = None, /, **kwargs: Any) -> Any: + """Get an OpenML object by type and identifier, or a dataset by name. + + Examples + -------- + openml.get("dataset", 61) + openml.get("dataset", "Fashion-MNIST") + openml.get("task", 31) + openml.get("flow", 10) + openml.get("run", 20) + openml.get("Fashion-MNIST") # dataset lookup by name (no type specified) + """ + # Single-argument shortcut: treat string without type as dataset lookup. + if identifier is None: + if isinstance(object_type_or_name, str): + return datasets.functions.get_dataset(object_type_or_name, **kwargs) + raise ValueError("Please provide an object_type when identifier is not provided.") + + object_type = str(object_type_or_name).lower() + dispatch: GetDispatcher = { + "dataset": datasets.functions.get_dataset, + "task": tasks.functions.get_task, + "flow": flows.functions.get_flow, + "run": runs.functions.get_run, + } + + try: + func = dispatch[object_type] + except KeyError as exc: # pragma: no cover - defensive branch + raise ValueError( + "Unsupported object_type for get; expected one of 'dataset', 'task', 'flow', 'run'.", + ) from exc + + return func(identifier, **kwargs) + def populate_cache( - task_ids: list[int] | None = None, - dataset_ids: list[int | str] | None = None, - flow_ids: list[int] | None = None, - run_ids: list[int] | None = None, + task_ids: builtins.list[int] | None = None, + dataset_ids: builtins.list[int | str] | None = None, + flow_ids: builtins.list[int] | None = None, + run_ids: builtins.list[int] | None = None, ) -> None: """ Populate a cache for offline and parallel usage of the OpenML connector. @@ -91,6 +156,8 @@ def populate_cache( __all__ = [ + "list", + "get", "OpenMLDataset", "OpenMLDataFeature", "OpenMLRun", diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py index 998046726..0cd3b8211 100644 --- a/tests/test_openml/test_openml.py +++ b/tests/test_openml/test_openml.py @@ -41,3 +41,27 @@ def test_populate_cache( assert task_mock.call_count == 2 for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]): assert argument[0] == fixture + + @mock.patch("openml.tasks.functions.list_tasks") + @mock.patch("openml.datasets.functions.list_datasets") + def test_list_dispatch(self, list_datasets_mock, list_tasks_mock): + openml.list("dataset", output_format="dataframe") + list_datasets_mock.assert_called_once_with(output_format="dataframe") + + openml.list("task", size=5) + list_tasks_mock.assert_called_once_with(size=5) + + @mock.patch("openml.tasks.functions.get_task") + @mock.patch("openml.datasets.functions.get_dataset") + def test_get_dispatch(self, get_dataset_mock, get_task_mock): + openml.get("dataset", 61) + get_dataset_mock.assert_called_with(61) + + openml.get("dataset", "Fashion-MNIST", version=2) + get_dataset_mock.assert_called_with("Fashion-MNIST", version=2) + + openml.get("Fashion-MNIST") + get_dataset_mock.assert_called_with("Fashion-MNIST") + + openml.get("task", 31) + get_task_mock.assert_called_with(31) From 990583e1210a80dc5ef658b0e6577a5d8281a0ff Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 25 Dec 2025 13:58:35 +0530 Subject: [PATCH 2/3] rename list method to list_all --- examples/Advanced/tasks_tutorial.py | 14 +- examples/Basics/simple_datasets_tutorial.py | 6 +- .../Basics/simple_flows_and_runs_tutorial.py | 6 +- examples/Basics/simple_tasks_tutorial.py | 2 +- openml/__init__.py | 159 +++++++++++------- 5 files changed, 110 insertions(+), 77 deletions(-) diff --git a/examples/Advanced/tasks_tutorial.py b/examples/Advanced/tasks_tutorial.py index 1418aa91c..f03eb65a8 100644 --- a/examples/Advanced/tasks_tutorial.py +++ b/examples/Advanced/tasks_tutorial.py @@ -24,13 +24,13 @@ # # We will start by simply listing only *supervised classification* tasks. # -# **openml.list("task")** (or **openml.tasks.list_tasks()**) returns a dictionary of +# **openml.list_all("task")** (or **openml.tasks.list_tasks()**) returns a dictionary of # dictionaries by default, but we request a # [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) # instead to have better visualization capabilities and easier access: # %% -tasks = openml.list("task", task_type=TaskType.SUPERVISED_CLASSIFICATION) +tasks = openml.list_all("task", task_type=TaskType.SUPERVISED_CLASSIFICATION) # Legacy path still works: # tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION) print(tasks.columns) @@ -68,7 +68,7 @@ # Similar to listing tasks by task type, we can list tasks by tags: # %% -tasks = openml.list("task", tag="OpenML100") +tasks = openml.list_all("task", tag="OpenML100") # Legacy path still works: # tasks = openml.tasks.list_tasks(tag="OpenML100") print(f"First 5 of {len(tasks)} tasks:") @@ -78,7 +78,7 @@ # Furthermore, we can list tasks based on the dataset id: # %% -tasks = openml.list("task", data_id=1471) +tasks = openml.list_all("task", data_id=1471) # Legacy path still works: # tasks = openml.tasks.list_tasks(data_id=1471) print(f"First 5 of {len(tasks)} tasks:") @@ -88,7 +88,7 @@ # In addition, a size limit and an offset can be applied both separately and simultaneously: # %% -tasks = openml.list("task", size=10, offset=50) +tasks = openml.list_all("task", size=10, offset=50) # Legacy path still works: # tasks = openml.tasks.list_tasks(size=10, offset=50) print(tasks) @@ -106,7 +106,7 @@ # Finally, it is also possible to list all tasks on OpenML with: # %% -tasks = openml.list("task") +tasks = openml.list_all("task") # Legacy path still works: # tasks = openml.tasks.list_tasks() print(len(tasks)) @@ -128,7 +128,7 @@ # %% task_id = 31 -task = openml.get("task", task_id) +task = openml.get(task_id, object_type="task") # Legacy path still works: # task = openml.tasks.get_task(task_id) diff --git a/examples/Basics/simple_datasets_tutorial.py b/examples/Basics/simple_datasets_tutorial.py index 6d90c22cb..beb15eec2 100644 --- a/examples/Basics/simple_datasets_tutorial.py +++ b/examples/Basics/simple_datasets_tutorial.py @@ -14,7 +14,7 @@ # ## List datasets stored on OpenML # %% -datasets_df = openml.list("dataset") +datasets_df = openml.list_all("dataset") print(datasets_df.head(n=10)) # Legacy path still works: @@ -25,9 +25,9 @@ # %% # Iris dataset https://www.openml.org/d/61 -dataset = openml.get("dataset", 61) +dataset = openml.get(61) # You can also fetch by name: -# dataset = openml.get("dataset", "Fashion-MNIST") +# dataset = openml.get("Fashion-MNIST") # Legacy path still works: # dataset = openml.datasets.get_dataset(dataset_id=61) diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py index f99685f6d..8c2f13748 100644 --- a/examples/Basics/simple_flows_and_runs_tutorial.py +++ b/examples/Basics/simple_flows_and_runs_tutorial.py @@ -28,10 +28,10 @@ # ## Quick: list flows and runs via unified entrypoints # %% -flows_df = openml.list("flow", size=3) +flows_df = openml.list_all("flow", size=3) print(flows_df.head()) -runs_df = openml.list("run", size=3) +runs_df = openml.list_all("run", size=3) print(runs_df.head()) # %% [markdown] @@ -39,7 +39,7 @@ # NOTE: We are using task 119 from the test server: https://test.openml.org/d/20 # %% -task = openml.get("task", 119) +task = openml.get(119, object_type="task") # Legacy path still works: # task = openml.tasks.get_task(119) diff --git a/examples/Basics/simple_tasks_tutorial.py b/examples/Basics/simple_tasks_tutorial.py index 0989d3e1d..3858eecc0 100644 --- a/examples/Basics/simple_tasks_tutorial.py +++ b/examples/Basics/simple_tasks_tutorial.py @@ -10,7 +10,7 @@ # [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31): # %% -task = openml.get("task", 31) +task = openml.get(31, object_type="task") # Legacy path still works: # task = openml.tasks.get_task(31) diff --git a/openml/__init__.py b/openml/__init__.py index 81aa7b44a..45eec3242 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -18,7 +18,6 @@ # License: BSD 3-Clause from __future__ import annotations -import builtins from typing import Any, Callable, Dict from . import ( @@ -55,71 +54,105 @@ ListDispatcher = Dict[str, Callable[..., Any]] GetDispatcher = Dict[str, Callable[..., Any]] +_LIST_DISPATCH: ListDispatcher = { + "dataset": datasets.functions.list_datasets, + "task": tasks.functions.list_tasks, + "flow": flows.functions.list_flows, + "run": runs.functions.list_runs, +} -def list(object_type: str, /, **kwargs: Any) -> Any: # noqa: A001 +_GET_DISPATCH: GetDispatcher = { + "dataset": datasets.functions.get_dataset, + "task": tasks.functions.get_task, + "flow": flows.functions.get_flow, + "run": runs.functions.get_run, +} + + +def list_all(object_type: str, /, **kwargs: Any) -> Any: """List OpenML objects by type (e.g., datasets, tasks, flows, runs). This is a convenience dispatcher that forwards to the existing type-specific ``list_*`` functions. Existing imports remain available for backward compatibility. + + Parameters + ---------- + object_type : str + The type of object to list. Must be one of 'dataset', 'task', 'flow', 'run'. + **kwargs : Any + Additional arguments passed to the underlying list function. + + Returns + ------- + Any + The result from the type-specific list function (typically a DataFrame). + + Raises + ------ + ValueError + If object_type is not one of the supported types. """ - dispatch: ListDispatcher = { - "dataset": datasets.functions.list_datasets, - "task": tasks.functions.list_tasks, - "flow": flows.functions.list_flows, - "run": runs.functions.list_runs, - } - - try: - func = dispatch[object_type.lower()] - except KeyError as exc: # pragma: no cover - defensive branch + if not isinstance(object_type, str): + raise TypeError(f"object_type must be a string, got {type(object_type).__name__}") + + func = _LIST_DISPATCH.get(object_type.lower()) + if func is None: + valid_types = ", ".join(repr(k) for k in _LIST_DISPATCH) raise ValueError( - "Unsupported object_type for list; expected one of 'dataset', 'task', 'flow', 'run'.", - ) from exc + f"Unsupported object_type {object_type!r}; expected one of {valid_types}.", + ) return func(**kwargs) -def get(object_type_or_name: Any, identifier: Any | None = None, /, **kwargs: Any) -> Any: - """Get an OpenML object by type and identifier, or a dataset by name. +def get(identifier: int | str, *, object_type: str = "dataset", **kwargs: Any) -> Any: + """Get an OpenML object by identifier. + + Parameters + ---------- + identifier : int | str + The ID or name of the object to retrieve. + object_type : str, default="dataset" + The type of object to get. Must be one of 'dataset', 'task', 'flow', 'run'. + **kwargs : Any + Additional arguments passed to the underlying get function. + + Returns + ------- + Any + The requested OpenML object. + + Raises + ------ + ValueError + If object_type is not one of the supported types. Examples -------- - openml.get("dataset", 61) - openml.get("dataset", "Fashion-MNIST") - openml.get("task", 31) - openml.get("flow", 10) - openml.get("run", 20) - openml.get("Fashion-MNIST") # dataset lookup by name (no type specified) + >>> openml.get(61) # Get dataset 61 (default object_type="dataset") + >>> openml.get("Fashion-MNIST") # Get dataset by name + >>> openml.get(31, object_type="task") # Get task 31 + >>> openml.get(10, object_type="flow") # Get flow 10 + >>> openml.get(20, object_type="run") # Get run 20 """ - # Single-argument shortcut: treat string without type as dataset lookup. - if identifier is None: - if isinstance(object_type_or_name, str): - return datasets.functions.get_dataset(object_type_or_name, **kwargs) - raise ValueError("Please provide an object_type when identifier is not provided.") - - object_type = str(object_type_or_name).lower() - dispatch: GetDispatcher = { - "dataset": datasets.functions.get_dataset, - "task": tasks.functions.get_task, - "flow": flows.functions.get_flow, - "run": runs.functions.get_run, - } - - try: - func = dispatch[object_type] - except KeyError as exc: # pragma: no cover - defensive branch + if not isinstance(object_type, str): + raise TypeError(f"object_type must be a string, got {type(object_type).__name__}") + + func = _GET_DISPATCH.get(object_type.lower()) + if func is None: + valid_types = ", ".join(repr(k) for k in _GET_DISPATCH) raise ValueError( - "Unsupported object_type for get; expected one of 'dataset', 'task', 'flow', 'run'.", - ) from exc + f"Unsupported object_type {object_type!r}; expected one of {valid_types}.", + ) return func(identifier, **kwargs) def populate_cache( - task_ids: builtins.list[int] | None = None, - dataset_ids: builtins.list[int | str] | None = None, - flow_ids: builtins.list[int] | None = None, - run_ids: builtins.list[int] | None = None, + task_ids: list[int] | None = None, + dataset_ids: list[int | str] | None = None, + flow_ids: list[int] | None = None, + run_ids: list[int] | None = None, ) -> None: """ Populate a cache for offline and parallel usage of the OpenML connector. @@ -156,35 +189,35 @@ def populate_cache( __all__ = [ - "list", - "get", - "OpenMLDataset", + "OpenMLBenchmarkSuite", + "OpenMLClassificationTask", + "OpenMLClusteringTask", "OpenMLDataFeature", - "OpenMLRun", - "OpenMLSplit", + "OpenMLDataset", "OpenMLEvaluation", - "OpenMLSetup", - "OpenMLParameter", - "OpenMLTask", - "OpenMLSupervisedTask", - "OpenMLClusteringTask", + "OpenMLFlow", "OpenMLLearningCurveTask", + "OpenMLParameter", "OpenMLRegressionTask", - "OpenMLClassificationTask", - "OpenMLFlow", + "OpenMLRun", + "OpenMLSetup", + "OpenMLSplit", "OpenMLStudy", - "OpenMLBenchmarkSuite", + "OpenMLSupervisedTask", + "OpenMLTask", + "__version__", + "_api_calls", + "config", "datasets", "evaluations", "exceptions", "extensions", - "config", - "runs", "flows", - "tasks", + "get", + "list_all", + "runs", "setups", "study", + "tasks", "utils", - "_api_calls", - "__version__", ] From dbb13b560b537bb3abb5d9eee958625979c56c9a Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 25 Dec 2025 14:04:37 +0530 Subject: [PATCH 3/3] update __init.py --- openml/__init__.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index 45eec3242..385180732 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -189,35 +189,35 @@ def populate_cache( __all__ = [ - "OpenMLBenchmarkSuite", - "OpenMLClassificationTask", - "OpenMLClusteringTask", - "OpenMLDataFeature", "OpenMLDataset", + "OpenMLDataFeature", + "OpenMLRun", + "OpenMLSplit", "OpenMLEvaluation", - "OpenMLFlow", - "OpenMLLearningCurveTask", + "OpenMLSetup", "OpenMLParameter", + "OpenMLTask", + "OpenMLSupervisedTask", + "OpenMLClusteringTask", + "OpenMLLearningCurveTask", "OpenMLRegressionTask", - "OpenMLRun", - "OpenMLSetup", - "OpenMLSplit", + "OpenMLClassificationTask", + "OpenMLFlow", "OpenMLStudy", - "OpenMLSupervisedTask", - "OpenMLTask", - "__version__", - "_api_calls", - "config", + "OpenMLBenchmarkSuite", "datasets", "evaluations", "exceptions", "extensions", - "flows", - "get", - "list_all", + "config", "runs", + "flows", + "tasks", "setups", "study", - "tasks", "utils", + "_api_calls", + "__version__", + "get", + "list_all", ]