Commit 177a75f

Merge pull request #2510 from bzantium:feature/#2509
PiperOrigin-RevId: 838961966
2 parents: 8c80266 + 496a7b2

4 files changed: +74, -11 lines

src/MaxText/configs/base.yml

Lines changed: 8 additions & 7 deletions
@@ -256,7 +256,7 @@ pipeline_delay_activation_forwarding: False # This delays the activation forward
 # and you must set the number of microbatches to at least 2 * num_stages (the minimum 2 * num_stages is set by default with this delay).
 
 model_fsdp_ag_once: False # This controls whether the Zero-1 optimization is active.
-# This is a memory/time tradeoff - True: This is Zero-1 Sharding. Use ZeroOneTransformer to gather weights once per gradient step.
+# This is a memory/time tradeoff - True: This is Zero-1 Sharding. Use ZeroOneTransformer to gather weights once per gradient step.
 # False: This is Zero-3 Sharing. Use the standard Transformer, which gathers for each microbatch's fwd/bwd pass.
 pipeline_fsdp_ag_once: False # If set to true then all gather all of the weights over FSDP before the first pipeline iteration.
 # This is a memory/time tradeoff - we now have to store the FSDP gathered weights and gradients (typically in bf16), as opposed
@@ -306,7 +306,7 @@ param_scan_axis: 1
 # The attention_type parameter determines the variants of attention, e.g. global or local_sliding
 attention: 'autoselected' # Supported attention: autoselected, dot_product, flash, cudnn_flash_te
 attention_type: 'global' # Supported attention_type: global, local_sliding, chunk, mla
-attention_bias: False # If True, adds a learnable bias to the query, key, and value projections
+attention_bias: False # If True, adds a learnable bias to the query, key, and value projections
 attention_sink: False
 sliding_window_size: 0
 chunk_attn_window_size: 0
@@ -424,7 +424,7 @@ logical_axis_rules: [
   ['embed_no_exp', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
   ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
   ['embed_no_exp', ['fsdp', 'sequence', 'context']],
-  ['embed_tensor_transpose', ['tensor_transpose']],
+  ['embed_tensor_transpose', ['tensor_transpose']],
   ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
   ['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
   ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
@@ -530,7 +530,7 @@ per_device_batch_size: 12.0
 # Each data-loading host will load per_device_batch_size * expansion_factor_real_data.
 # When set to between 0 and 1, it's for grain pipeline to use a smaller chip count to read checkpoint from a larger chip count job.
 # Details in https://github.com/AI-Hypercomputer/maxtext/blob/main/docs/guides/data_input_grain.md#using-grain
-expansion_factor_real_data: -1.0
+expansion_factor_real_data: -1.0
 eval_per_device_batch_size: 0.0
 max_corpus_chars: 10_000_000
 train_data_columns: ['text'] # for DPO dataset containing "chosen" and "rejected"
@@ -595,14 +595,15 @@ grain_train_files: ''
 grain_eval_files: ''
 grain_train_mixture_config_path: '' # Path to a JSON file specifying the mixture weights for Grain training data.
 grain_file_type: 'arrayrecord' # arrayrecord or parquet
-grain_worker_count: 1
+grain_worker_count: 1 # Set to -1 to enable auto-tuning: automatically determines optimal worker count. See https://google-grain.readthedocs.io/en/latest/_autosummary/grain.experimental.pick_performance_config.html
 grain_per_worker_buffer_size: 1
 # num_threads and prefetch_buffer_size are per-worker per-dataset. Used in ReadOptions (https://google-grain.readthedocs.io/en/latest/tutorials/data_loader_tutorial.html#per-worker-readoptions)
 # The default value matches that in the Grain package. If mixing multiple data sources, consider lowering these values to reduce memory usage.
-grain_num_threads: 16
+grain_num_threads: 16
 grain_prefetch_buffer_size: 500
 grain_worker_count_eval: 1
 grain_per_worker_buffer_size_eval: 1
+grain_ram_budget_mb: 1024 # RAM budget (MB) for auto-tuning worker count. Only used when grain_worker_count is -1.
 grain_num_threads_eval: 16
 grain_prefetch_buffer_size_eval: 500
 grain_data_source_max_workers: 16 # Max workers for ThreadPoolExecutor when mixing multiple Grain data sources.
@@ -930,7 +931,7 @@ temporal_patch_size_for_vit: 2
 num_position_embeddings_for_vit: 1024
 deepstack_visual_indexes_for_vit: []
 
-# Subslice shape in the form of "x,y,z" when using pathways (single controller).
+# Subslice shape in the form of "x,y,z" when using pathways (single controller).
 # Example: "8,8" to use a 8x8 subgrid (64 chips) of a full pod (16x16) of trillium.
 subslice_shape: ""
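
The grain hunk above is the substance of this file's change: grain_worker_count: -1 delegates worker selection to Grain's auto-tuner, and grain_ram_budget_mb caps the memory that tuner may plan for. A minimal sketch of enabling this from Python, mirroring the test added at the end of this commit (the data glob and the 2048 MB budget are placeholder values, not part of the commit):

import sys
from MaxText import pyconfig

config = pyconfig.initialize(
    [sys.argv[0], "src/MaxText/configs/base.yml"],
    dataset_type="grain",
    grain_train_files="/data/c4/c4-train.array_record*",  # placeholder glob
    grain_worker_count=-1,     # -1 = let Grain's pick_performance_config choose
    grain_ram_budget_mb=2048,  # placeholder: cap the auto-tuner's RAM plan at 2 GiB
)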

src/MaxText/configs/types.py

Lines changed: 1 addition & 0 deletions
@@ -860,6 +860,7 @@ class GrainDataset(BaseModel):
   grain_per_worker_buffer_size_eval: int = Field(
       1, description="Buffer size for each worker for Grain data loading during evaluation."
   )
+  grain_ram_budget_mb: int = Field(1024, description="RAM budget (MB) for auto-tuning worker count.")
   grain_num_threads: int = Field(16, description="Number of threads for Grain ReadOptions during training.")
   grain_prefetch_buffer_size: int = Field(500, description="Prefetch buffer size for Grain ReadOptions during training.")
   grain_num_threads_eval: int = Field(16, description="Number of threads for Grain ReadOptions during evaluation.")
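
The new field follows the plain pydantic pattern used throughout GrainDataset. A self-contained sketch of the validation behavior it picks up (the GrainTuning mini-model here is hypothetical, for illustration only):

from pydantic import BaseModel, Field

class GrainTuning(BaseModel):
  # Hypothetical mini-model; the real field lives on GrainDataset in types.py.
  grain_ram_budget_mb: int = Field(1024, description="RAM budget (MB) for auto-tuning worker count.")

print(GrainTuning().grain_ram_budget_mb)  # -> 1024 (default)
print(GrainTuning(grain_ram_budget_mb=4096).grain_ram_budget_mb)  # -> 4096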

src/MaxText/input_pipeline/_grain_data_processing.py

Lines changed: 21 additions & 4 deletions
@@ -23,6 +23,7 @@
 
 import jax
 
+from grain.experimental import pick_performance_config
 import grain.python as grain
 
 from MaxText.utils import gcs_utils
@@ -230,12 +231,20 @@ def pretrain_preprocessing_pipeline(
           axis=1,
       )
   )
-  dataset = dataset.mp_prefetch(
-      grain.MultiprocessingOptions(
+  multiprocessing_options = (
+      pick_performance_config(
+          ds=dataset,
+          ram_budget_mb=config.grain_ram_budget_mb,
+          max_workers=None,
+          max_buffer_size=None,
+      ).multiprocessing_options
+      if grain_worker_count == -1
+      else grain.MultiprocessingOptions(
           num_workers=grain_worker_count,
           per_worker_buffer_size=grain_per_worker_buffer_size,
       )
   )
+  dataset = dataset.mp_prefetch(multiprocessing_options)
   return dataset
@@ -273,12 +282,20 @@ def dpo_preprocessing_pipeline(
   batch_size = config.global_batch_size_to_load // jax.process_count()
   batch_fn = functools.partial(grain.experimental.batch_and_pad, batch_size=batch_size, pad_value=pad_id)
   dataset = dataset.batch(batch_size, batch_fn=batch_fn)
-  dataset = dataset.mp_prefetch(
-      grain.MultiprocessingOptions(
+  multiprocessing_options = (
+      pick_performance_config(
+          ds=dataset,
+          ram_budget_mb=config.grain_ram_budget_mb,
+          max_workers=None,
+          max_buffer_size=None,
+      ).multiprocessing_options
+      if grain_worker_count == -1
+      else grain.MultiprocessingOptions(
          num_workers=grain_worker_count,
          per_worker_buffer_size=grain_per_worker_buffer_size,
      )
  )
+  dataset = dataset.mp_prefetch(multiprocessing_options)
   return dataset
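
Both hunks introduce the same conditional, so it reads more clearly factored out. A sketch of the shared logic under the same imports as the file above (the helper name is ours, not the commit's; the calls and keyword arguments are exactly those in the diff):

from grain.experimental import pick_performance_config
import grain.python as grain


def _choose_multiprocessing_options(dataset, worker_count, per_worker_buffer_size, ram_budget_mb):
  """Hypothetical helper: auto-tune when worker_count == -1, else use the explicit settings."""
  if worker_count == -1:
    # Ask Grain to benchmark the pipeline and pick worker/buffer settings
    # that fit within the configured RAM budget.
    return pick_performance_config(
        ds=dataset,
        ram_budget_mb=ram_budget_mb,
        max_workers=None,
        max_buffer_size=None,
    ).multiprocessing_options
  return grain.MultiprocessingOptions(
      num_workers=worker_count,
      per_worker_buffer_size=per_worker_buffer_size,
  )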

tests/grain_data_processing_test.py

Lines changed: 44 additions & 0 deletions
@@ -22,6 +22,7 @@
 import json
 
 import jax
+import pytest
 from jax.sharding import Mesh
 from jax.experimental import mesh_utils
 
@@ -182,6 +183,49 @@ def setUp(self):
     self.train_iter = _grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
 
 
+class GrainArrayRecordAutoTuneTest(GrainArrayRecordProcessingTest):
+  """Test grain data processing with auto-tuning enabled (grain_worker_count=-1)."""
+
+  def setUp(self):
+    super().setUp()
+    temp_dir = tempfile.gettempdir()
+    self.config = pyconfig.initialize(
+        [sys.argv[0], os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
+        per_device_batch_size=1,
+        run_name="test",
+        mesh_axes=["data"],
+        logical_axis_rules=[["batch", "data"]],
+        data_sharding=["data"],
+        base_output_directory="gs://max-experiments/",
+        dataset_type="grain",
+        grain_train_files=os.path.join(
+            temp_dir, "gcsfuse", "array-record", "c4", "en", "3.0.1", "c4-train.array_record*"
+        ),
+        grain_worker_count=-1,  # Enable auto-tuning
+        tokenizer_path=os.path.join(MAXTEXT_ASSETS_ROOT, "tokenizer"),
+        enable_checkpointing=False,
+    )
+    self.mesh_shape_1d = (len(jax.devices()),)
+    self.mesh = Mesh(mesh_utils.create_device_mesh(self.mesh_shape_1d), self.config.mesh_axes)
+    self.process_indices = input_pipeline_interface.get_process_loading_real_data(
+        self.config.data_sharding,
+        self.config.global_batch_size_to_load,
+        self.config.global_batch_size_to_train_on,
+        self.config.max_target_length,
+        self.mesh,
+    )
+    self.train_iter = _grain_data_processing.make_grain_train_iterator(self.config, self.mesh, self.process_indices)
+
+  @pytest.mark.skip(
+      reason=(
+          "Auto-tuning tries multiple numbers of workers during the first few batches "
+          "and it affects batch determinism at first."
+      )
+  )
+  def test_batch_determinism(self):
+    super().test_batch_determinism()
+
+
 class GrainParquetProcessingTest(unittest.TestCase):
 
   @classmethod
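
The subclass reruns every GrainArrayRecordProcessingTest case against the auto-tuned pipeline, skipping only the determinism check. A sketch of invoking just the new class (assuming the gcsfuse test data is staged as in the sibling tests):

import pytest

# Run only the auto-tune tests; equivalent to
#   pytest tests/grain_data_processing_test.py -k GrainArrayRecordAutoTuneTest
pytest.main(["tests/grain_data_processing_test.py", "-k", "GrainArrayRecordAutoTuneTest"])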
