Skip to content
4 changes: 4 additions & 0 deletions openml/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,9 @@ def get_task(
# Including class labels as part of task meta data handles
# the case where data download was initially disabled
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
assert task.target_name is not None, (
"Supervised tasks must define a target feature before retrieving class labels."
)
task.class_labels = dataset.retrieve_class_labels(task.target_name)
# Clustering tasks do not have class labels
# and do not offer download_split
Expand Down Expand Up @@ -598,6 +601,7 @@ def create_task(
)

return task_cls(
task_id=None,
task_type_id=task_type,
task_type="None", # TODO: refactor to get task type string from ID.
data_set_id=dataset_id,
Expand Down
153 changes: 47 additions & 106 deletions openml/tasks/task.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
# License: BSD 3-Clause
# TODO(eddbergman): Seems like a lot of the subclasses could just get away with setting
# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code.
from __future__ import annotations

import warnings
from abc import ABC
from collections.abc import Sequence
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, ClassVar
from typing_extensions import TypedDict

import openml._api_calls
Expand Down Expand Up @@ -71,31 +69,45 @@ class OpenMLTask(OpenMLBase):
Refers to the URL of the data splits used for the OpenML task.
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__(  # noqa: PLR0913
    self,
    task_id: int | None,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    target_name: str | None = None,
):
    """Store the attributes shared by every task type.

    Parameters
    ----------
    task_id : int or None
        Identifier of the task; cast to ``int`` when it is not ``None``.
    task_type_id : TaskType
        Task type identifier, stored unchanged.
    task_type : str
        Task type name, stored unchanged.
    data_set_id : int
        Identifier of the dataset; cast to ``int`` and stored as ``dataset_id``.
    estimation_procedure_id : int or None, default=None
        Identifier of the estimation procedure. ``None`` selects the class
        attribute ``DEFAULT_ESTIMATION_PROCEDURE_ID`` of the concrete class.
    estimation_procedure_type : str or None, default=None
        Stored under the ``"type"`` key of ``estimation_procedure``.
    estimation_parameters : dict of str to str, or None, default=None
        Stored under the ``"parameters"`` key of ``estimation_procedure``.
    evaluation_measure : str or None, default=None
        Name of the evaluation measure, stored unchanged.
    data_splits_url : str or None, default=None
        Stored under the ``"data_splits_url"`` key of ``estimation_procedure``.
    target_name : str or None, default=None
        Name of the target feature, stored unchanged.
    """
    self.task_id = int(task_id) if task_id is not None else None
    self.task_type_id = task_type_id
    self.task_type = task_type
    self.dataset_id = int(data_set_id)
    self.target_name = target_name
    self.evaluation_measure = evaluation_measure
    self.estimation_procedure: _EstimationProcedure = {
        "type": estimation_procedure_type,
        "parameters": estimation_parameters,
        "data_splits_url": data_splits_url,
    }
    # A single assignment: ``None`` is replaced by the per-class default so
    # subclasses only need to override ``DEFAULT_ESTIMATION_PROCEDURE_ID``.
    self.estimation_procedure_id = self._resolve_estimation_procedure_id(
        estimation_procedure_id,
    )
    self.split: OpenMLSplit | None = None

def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int:
    """Return the given estimation procedure ID, or the class default for ``None``.

    Parameters
    ----------
    estimation_procedure_id : int or None
        Explicit ID supplied by the caller, or ``None`` to request the default.

    Returns
    -------
    int
        ``estimation_procedure_id`` when it is not ``None``; otherwise
        ``self.DEFAULT_ESTIMATION_PROCEDURE_ID``.
    """
    # Compare against ``None`` rather than truthiness so an explicit ``0`` is kept.
    if estimation_procedure_id is None:
        return self.DEFAULT_ESTIMATION_PROCEDURE_ID
    return estimation_procedure_id

@classmethod
def _entity_letter(cls) -> str:
return "t"
Expand Down Expand Up @@ -129,7 +141,8 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
if class_labels is not None:
fields["# of Classes"] = len(class_labels)

if hasattr(self, "cost_matrix"):
cost_matrix = getattr(self, "cost_matrix", None)
if cost_matrix is not None:
fields["Cost Matrix"] = "Available"

# determines the order in which the information will be printed
Expand Down Expand Up @@ -250,33 +263,37 @@ class OpenMLSupervisedTask(OpenMLTask, ABC):
Refers to the unique identifier of task.
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
):
    """Initialise a supervised task by forwarding everything to the base class.

    Parameters
    ----------
    task_type_id : TaskType
        Task type identifier.
    task_type : str
        Task type name.
    data_set_id : int
        Identifier of the dataset the task is defined on.
    target_name : str
        Name of the target feature.
    estimation_procedure_id : int or None, default=None
        Identifier of the estimation procedure. ``None`` is replaced by
        ``DEFAULT_ESTIMATION_PROCEDURE_ID`` of the concrete class.
    estimation_procedure_type : str or None, default=None
        Type of the estimation procedure.
    estimation_parameters : dict of str to str, or None, default=None
        Parameters of the estimation procedure.
    evaluation_measure : str or None, default=None
        Name of the evaluation measure.
    data_splits_url : str or None, default=None
        URL of the data splits.
    task_id : int or None, default=None
        Identifier of the task.
    """
    # ``target_name`` is passed through the base constructor, which stores
    # it on the instance; no separate assignment is needed afterwards.
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=self._resolve_estimation_procedure_id(
            estimation_procedure_id,
        ),
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
        target_name=target_name,
    )

def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
"""Get data associated with the current task.

Expand Down Expand Up @@ -331,6 +348,8 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):

Parameters
----------
task_id : Union[int, None]
ID of the Classification task (if it already exists on OpenML).
task_type_id : TaskType
ID of the Classification task type.
task_type : str
Expand All @@ -339,7 +358,7 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
ID of the OpenML dataset associated with the Classification task.
target_name : str
Name of the target variable.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=1
ID of the estimation procedure for the Classification task.
estimation_procedure_type : str, default=None
Type of the estimation procedure.
Expand All @@ -349,21 +368,21 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
Name of the evaluation measure.
data_splits_url : str, default=None
URL of the data splits for the Classification task.
task_id : Union[int, None]
ID of the Classification task (if it already exists on OpenML).
class_labels : List of str, default=None
A list of class labels (for classification tasks).
cost_matrix : array, default=None
A cost matrix (for classification tasks).
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__( # noqa: PLR0913
self,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
target_name: str,
estimation_procedure_id: int = 1,
estimation_procedure_id: int | None = None,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
evaluation_measure: str | None = None,
Expand All @@ -373,20 +392,19 @@ def __init__( # noqa: PLR0913
cost_matrix: np.ndarray | None = None,
):
super().__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
target_name=target_name,
estimation_procedure_id=estimation_procedure_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
target_name=target_name,
data_splits_url=data_splits_url,
task_id=task_id,
)
self.class_labels = class_labels
self.cost_matrix = cost_matrix

if cost_matrix is not None:
raise NotImplementedError("Costmatrix functionality is not yet implemented.")

Expand All @@ -396,6 +414,8 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):

Parameters
----------
task_id : Union[int, None]
ID of the OpenML Regression task.
task_type_id : TaskType
Task type ID of the OpenML Regression task.
task_type : str
Expand All @@ -404,62 +424,36 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):
ID of the OpenML dataset.
target_name : str
Name of the target feature used in the Regression task.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=7
ID of the OpenML estimation procedure.
estimation_procedure_type : str, default=None
Type of the OpenML estimation procedure.
estimation_parameters : dict, default=None
Parameters used by the OpenML estimation procedure.
data_splits_url : str, default=None
URL of the OpenML data splits for the Regression task.
task_id : Union[int, None]
ID of the OpenML Regression task.
evaluation_measure : str, default=None
Evaluation measure used in the Regression task.
"""

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 7,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
):
    """Initialise a regression task; every argument is forwarded to the parent.

    The only regression-specific aspect of this constructor is the default
    ``estimation_procedure_id`` of ``7``.
    """
    # Keyword arguments are listed in the same order as the signature above.
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        task_id=task_id,
        evaluation_measure=evaluation_measure,
    )
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7


class OpenMLClusteringTask(OpenMLTask):
"""OpenML Clustering object.

Parameters
----------
task_id : Union[int, None]
ID of the OpenML clustering task.
task_type_id : TaskType
Task type ID of the OpenML clustering task.
task_type : str
Task type of the OpenML clustering task.
data_set_id : int
ID of the OpenML dataset used in clustering the task.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=17
ID of the OpenML estimation procedure.
task_id : Union[int, None]
ID of the OpenML clustering task.
estimation_procedure_type : str, default=None
Type of the OpenML estimation procedure used in the clustering task.
estimation_parameters : dict, default=None
Expand All @@ -473,32 +467,7 @@ class OpenMLClusteringTask(OpenMLTask):
feature set for the clustering task.
"""

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int = 17,
    task_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    evaluation_measure: str | None = None,
    target_name: str | None = None,
):
    """Initialise a clustering task.

    All arguments except ``target_name`` are forwarded to the parent
    constructor; ``target_name`` is stored on the instance afterwards.
    The default ``estimation_procedure_id`` is ``17``.
    """
    # Keyword arguments are listed in the same order as the signature above.
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        task_id=task_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        evaluation_measure=evaluation_measure,
    )

    # Set after the parent constructor so this value is the one that sticks.
    self.target_name = target_name
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17

def get_X(self) -> pd.DataFrame:
"""Get data associated with the current task.
Expand Down Expand Up @@ -534,6 +503,8 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):

Parameters
----------
task_id : Union[int, None]
ID of the Learning Curve task.
task_type_id : TaskType
ID of the Learning Curve task.
task_type : str
Expand All @@ -542,16 +513,14 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
ID of the dataset that this task is associated with.
target_name : str
Name of the target feature in the dataset.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=13
ID of the estimation procedure to use for evaluating models.
estimation_procedure_type : str, default=None
Type of the estimation procedure.
estimation_parameters : dict, default=None
Additional parameters for the estimation procedure.
data_splits_url : str, default=None
URL of the file containing the data splits for Learning Curve task.
task_id : Union[int, None]
ID of the Learning Curve task.
evaluation_measure : str, default=None
Name of the evaluation measure to use for evaluating models.
class_labels : list of str, default=None
Expand All @@ -560,32 +529,4 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
Cost matrix for Learning Curve tasks.
"""

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 13,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    """Initialise a learning curve task; every argument is forwarded to the parent.

    The only learning-curve-specific aspect of this constructor is the
    default ``estimation_procedure_id`` of ``13``.
    """
    # Keyword arguments are listed in the same order as the signature above.
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        task_id=task_id,
        evaluation_measure=evaluation_measure,
        class_labels=class_labels,
        cost_matrix=cost_matrix,
    )
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13