Skip to content

Commit afbe4a9

Browse files
committed
add github change request
1 parent 692efe4 commit afbe4a9

File tree

3 files changed

+22
-12
lines changed

3 files changed

+22
-12
lines changed

cleanlab_studio/internal/util.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,12 +222,12 @@ def check_not_none(x: Any) -> bool:
222222

223223

224224
# Studio team port to backend
225-
def get_autofix_defaults_for_strategy(strategy):
225+
def _get_autofix_defaults_for_strategy(strategy):
226226
return AUTOFIX_DEFAULTS[strategy]
227227

228228

229-
def get_param_values(cleanset_df, params, strategy):
230-
thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params
229+
def _get_param_values(cleanset_df, params, strategy):
230+
thresholds = _get_autofix_defaults_for_strategy(strategy) if params is None else params
231231
param_values = {}
232232
for param_type, param_value in thresholds.items():
233233
# Convert drop fractions to number of rows and leave rest of the parameters as is

cleanlab_studio/studio/studio.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
apply_corrections_spark_df,
2020
apply_corrections_pd_df,
2121
apply_autofixed_cleanset_to_new_dataframe,
22-
get_autofix_defaults_for_strategy,
23-
get_param_values,
22+
_get_autofix_defaults_for_strategy,
23+
_get_param_values,
2424
)
2525
from cleanlab_studio.internal.settings import CleanlabSettings
2626
from cleanlab_studio.internal.types import FieldSchemaDict
@@ -370,20 +370,30 @@ def autofix_dataset(
370370
strategy="optimized_training_data",
371371
) -> pd.DataFrame:
372372
"""
373-
This method returns the auto-fixed dataset.
373+
This method returns the auto-fixed dataset. It works for text or tabular datasets only.
374374
Args:
375375
cleanset_id (str): ID of cleanset.
376+
original_df (pd.DataFrame): The original dataset in DataFrame format.
376377
params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and
377378
fraction of rows to drop for each issue type. If not provided, default values will be used.
378-
379-
Example:
379+
This dictionary includes the following options:
380+
381+
* drop_ambiguous (float): Fraction of rows to drop when encountering ambiguous data. Default is 0.0 (no rows dropped).
382+
* drop_label_issue (float): Fraction of rows to drop when facing label-related issues. Default is 0.5 (50% of rows dropped).
383+
* drop_near_duplicate (float): Fraction of rows to drop for near-duplicate data. Default is 0.5 (50% of rows dropped).
384+
* drop_outlier (float): Fraction of rows to drop for outlier data. Default is 0.2 (20% of rows dropped).
385+
* relabel_confidence_threshold (float): Confidence threshold for auto-relabelling. Default is 0.95.
386+
For example, the default values are:
380387
{
381388
'drop_ambiguous': 0.0,
382389
'drop_label_issue': 0.5,
383390
'drop_near_duplicate': 0.5,
384391
'drop_outlier': 0.2,
385392
'relabel_confidence_threshold': 0.95
386393
}
394+
395+
Specify values in params to customize the behavior for specific scenarios. If params are provided, the values in params take precedence over default ones.
396+
387397
strategy (str): Auto-fixing strategy to use:
388398
Possible strategies: optimized_training_data, drop_all_issues, suggested_actions
389399
@@ -394,7 +404,7 @@ def autofix_dataset(
394404
cleanset_df = self.download_cleanlab_columns(cleanset_id)
395405
if params is not None and strategy is not None:
396406
raise ValueError("Please provide only of params or strategy for autofix")
397-
param_values = get_param_values(cleanset_df, params, strategy)
407+
param_values = _get_param_values(cleanset_df, params, strategy)
398408
return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values)
399409

400410
def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, float]:
@@ -408,4 +418,4 @@ def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str,
408418
dict[str, float]: parameter dictionary containing confidence threshold for auto-relabelling, and
409419
fraction of rows to drop for each issue type.
410420
"""
411-
return get_autofix_defaults_for_strategy(strategy)
421+
return _get_autofix_defaults_for_strategy(strategy)

tests/test_autofix_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pandas as pd
22
import pytest
33
from cleanlab_studio.internal.util import (
4-
get_param_values,
4+
_get_param_values,
55
_update_label_based_on_confidence,
66
_get_top_fraction_ids,
77
_get_indices_to_drop,
@@ -50,7 +50,7 @@ def test_get_param_values(self, strategy, expected_results):
5050
cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4
5151
cleanlab_columns["is_ambiguous"] = [True] * 10
5252

53-
params = get_param_values(cleanlab_columns, None, strategy)
53+
params = _get_param_values(cleanlab_columns, None, strategy)
5454
assert params == expected_results
5555

5656
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)