add github change request

aditya1503 · aditya1503 · commit afbe4a9bf8fb · 2023-12-13T22:32:05.000+05:30
diff --git a/cleanlab_studio/internal/util.py b/cleanlab_studio/internal/util.py
@@ -222,12 +222,12 @@ def check_not_none(x: Any) -> bool:
 
 
 # Studio team port to backend
-def get_autofix_defaults_for_strategy(strategy):
+def _get_autofix_defaults_for_strategy(strategy):
     return AUTOFIX_DEFAULTS[strategy]
 
 
-def get_param_values(cleanset_df, params, strategy):
-    thresholds = get_autofix_defaults_for_strategy(strategy) if params is None else params
+def _get_param_values(cleanset_df, params, strategy):
+    thresholds = _get_autofix_defaults_for_strategy(strategy) if params is None else params
     param_values = {}
     for param_type, param_value in thresholds.items():
         # Convert drop fractions to number of rows and leave rest of the parameters as is
diff --git a/cleanlab_studio/studio/studio.py b/cleanlab_studio/studio/studio.py
@@ -19,8 +19,8 @@
     apply_corrections_spark_df,
     apply_corrections_pd_df,
     apply_autofixed_cleanset_to_new_dataframe,
-    get_autofix_defaults_for_strategy,
-    get_param_values,
+    _get_autofix_defaults_for_strategy,
+    _get_param_values,
 )
 from cleanlab_studio.internal.settings import CleanlabSettings
 from cleanlab_studio.internal.types import FieldSchemaDict
@@ -370,20 +370,30 @@ def autofix_dataset(
         strategy="optimized_training_data",
     ) -> pd.DataFrame:
         """
-        This method returns the auto-fixed dataset.
+        This method returns the auto-fixed dataset. It works for text or tabular datasets only.
         Args:
             cleanset_id (str): ID of cleanset.
+            original_df (pd.DataFrame): The original dataset in DataFrame format.
             params (dict, optional): Default parameter dictionary containing confidence threshold for auto-relabelling, and
                 fraction of rows to drop for each issue type. If not provided, default values will be used.
-
-                Example:
+                This dictionary includes the following options:
+
+                    * drop_ambiguous (float): Fraction of rows to drop when encountering ambiguous data. Default is 0.0 (no rows dropped).
+                    * drop_label_issue (float): Fraction of rows to drop when facing label-related issues. Default is 0.5 (50% of rows dropped).
+                    * drop_near_duplicate (float): Fraction of rows to drop for near-duplicate data. Default is 0.5 (50% of rows dropped).
+                    * drop_outlier (float): Fraction of rows to drop for outlier data. Default is 0.2 (20% of rows dropped).
+                    * relabel_confidence_threshold (float): Confidence threshold for auto-relabelling. Default is 0.95.
+                For example, the default values are:
                 {
                     'drop_ambiguous': 0.0,
                     'drop_label_issue': 0.5,
                     'drop_near_duplicate': 0.5,
                     'drop_outlier': 0.2,
                     'relabel_confidence_threshold': 0.95
                 }
+
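+                Specify values in params to customize the behavior for specific scenarios. If params is provided, its values are used instead of the strategy defaults; params and strategy are mutually exclusive, so pass strategy=None when supplying params (otherwise a ValueError is raised).
+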
             strategy (str): Auto-fixing strategy to use,
                 Possible strategies: optimized_training_data, drop_all_issues, suggested_actions
 
@@ -394,7 +404,7 @@ def autofix_dataset(
         cleanset_df = self.download_cleanlab_columns(cleanset_id)
         if params is not None and strategy is not None:
             raise ValueError("Please provide only of params or strategy for autofix")
-        param_values = get_param_values(cleanset_df, params, strategy)
+        param_values = _get_param_values(cleanset_df, params, strategy)
         return apply_autofixed_cleanset_to_new_dataframe(original_df, cleanset_df, param_values)
 
     def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str, float]:
@@ -408,4 +418,4 @@ def get_autofix_defaults(self, strategy="optimized_training_data") -> Dict[str,
             dict[str, float]: parameter dictionary containing confidence threshold for auto-relabelling, and
                 fraction of rows to drop for each issue type.
         """
-        return get_autofix_defaults_for_strategy(strategy)
+        return _get_autofix_defaults_for_strategy(strategy)
diff --git a/tests/test_autofix_utils.py b/tests/test_autofix_utils.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
 from cleanlab_studio.internal.util import (
-    get_param_values,
+    _get_param_values,
     _update_label_based_on_confidence,
     _get_top_fraction_ids,
     _get_indices_to_drop,
@@ -50,7 +50,7 @@ def test_get_param_values(self, strategy, expected_results):
         cleanlab_columns["is_outlier"] = [True] * 6 + [False] * 4
         cleanlab_columns["is_ambiguous"] = [True] * 10
 
-        params = get_param_values(cleanlab_columns, None, strategy)
+        params = _get_param_values(cleanlab_columns, None, strategy)
         assert params == expected_results
 
     @pytest.mark.parametrize(