AI-Hypercomputer
diff --git a/‎benchmarks/globals.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/globals.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/maxtext_xpk_runner.py‎
Lines changed: 9 additions & 3 deletions b/‎benchmarks/maxtext_xpk_runner.py‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎benchmarks/recipes/args_helper.py‎
Lines changed: 4 additions & 27 deletions b/‎benchmarks/recipes/args_helper.py‎
Lines changed: 4 additions & 27 deletions
diff --git a/‎benchmarks/recipes/mcjax_long_running_recipe.py‎
Lines changed: 2 additions & 1 deletion b/‎benchmarks/recipes/mcjax_long_running_recipe.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎benchmarks/recipes/parser_utils.py‎
Lines changed: 158 additions & 0 deletions b/‎benchmarks/recipes/parser_utils.py‎
Lines changed: 158 additions & 0 deletions
diff --git a/‎benchmarks/recipes/pw_elastic_training_recipe.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/recipes/pw_elastic_training_recipe.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/recipes/pw_headless_mode.py‎
Lines changed: 2 additions & 2 deletions b/‎benchmarks/recipes/pw_headless_mode.py‎
Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 import os.path
 
 # This is the MaxText root: with "max_utils.py"; &etc. TODO: Replace `os.path.basename` with `os.path.abspath`
-MAXTEXT_PKG_DIR = os.environ.get("MAXTEXT_PKG_DIR", "MaxText")
+MAXTEXT_PKG_DIR = os.environ.get("MAXTEXT_PKG_DIR", "src/MaxText")
 
 # This is the maxtext repo root: with ".git" folder; "README.md"; "pyproject.toml"; &etc.
 MAXTEXT_REPO_ROOT = os.environ.get(
 
@@ -583,6 +583,8 @@ def generate_xpk_workload_cmd(
     cluster_config: XpkClusterConfig,
     wl_config: WorkloadConfig,
     workload_name=None,
+    user=os.environ["USER"],
+    temp_key=None,
     exp_name=None,
 ):
   """Generates a command to run a maxtext model on XPK."""
@@ -592,15 +594,19 @@ def generate_xpk_workload_cmd(
 
   time.localtime()
   length_of_random_str = 3
-  temp_post_fix = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length_of_random_str))
+  # Allow DAG to resolve workload name for cleanup, preventing reliance on random IDs
+  if temp_key is not None:
+    temp_post_fix = temp_key
+  else:
+    temp_post_fix = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length_of_random_str))
 
   truncate_model_name = 10
   truncate_prefix = 3
   post_fix = f"-{wl_config.num_slices}-{time.strftime('%m%d%H', time.localtime())}-{temp_post_fix}"
-  common_prefix = os.environ["USER"]
+  common_prefix = user
   pw_prefix = "pw-"
 
-  if workload_name is None:  # Generate name if not provided
+  if workload_name is None:
     if is_pathways_enabled:
       post_fix = f"-{wl_config.num_slices}-{temp_post_fix}"
       name = f"{pw_prefix}{wl_config.model.model_name.replace('_', '-')[:truncate_model_name - len(pw_prefix)]}"
 
@@ -20,7 +20,6 @@
 be used to clean up existing XPK workloads before starting a new run.
 """
 
-import argparse
 import os
 
 from benchmarks.xpk_configs import XpkClusterConfig
@@ -66,41 +65,19 @@ def handle_delete_specific_workload(cluster_config: XpkClusterConfig, workload_n
   os.system(f"yes | {delete_command}")
 
 
-def handle_cmd_args(cluster_config: XpkClusterConfig, *actions: str, **kwargs) -> bool:
+def handle_cmd_args(cluster_config: XpkClusterConfig, is_delete: bool, user: str, **kwargs) -> bool:
   """Parses command-line arguments and executes the specified actions.
 
   Args:
       cluster_config: Contains Cluster configuration information that's helpful
         for running the actions.
-      *actions: Variable number of string arguments representing the actions to
-        be performed.
+      is_delete: Whether the delete action should be performed.
+      user: User name forwarded to the delete handler.
       **kwargs: Optional keyword arguments to be passed to action handlers.
-
-  Raises:
-    ValueError: If an unsupported action is provided or if unknown arguments are
-    passed.
   """
-
-  parser = argparse.ArgumentParser()
-
-  if DELETE in actions:
-    parser.add_argument(
-        "--delete",
-        action="store_true",
-        help="Delete workloads starting with the user's first five characters.",
-    )
-
-  known_args, unknown_args = parser.parse_known_args()
-
-  if unknown_args:
-    raise ValueError(f"Unrecognized arguments: {unknown_args}")
-
-  # Get user
-  user = os.environ["USER"]
-
   # Handle actions
   should_continue = True
-  if DELETE in actions and known_args.delete:
+  if is_delete:
     _handle_delete(cluster_config, user, **kwargs)
     should_continue = False
 
 
@@ -27,6 +27,7 @@
 import benchmarks.maxtext_trillium_model_configs as model_configs
 import benchmarks.maxtext_xpk_runner as mxr
 from benchmarks.xpk_configs import XpkClusterConfig
+from . import user_configs
 
 # Cluster Params
 CLUSTER = "v6e-256-cluster"
@@ -57,7 +58,7 @@ def main() -> None:
   )
 
   # Handle command line arguments using args_helper
-  should_continue = helper.handle_cmd_args(cluster_config, helper.DELETE, xpk_path=XPK_PATH)
+  should_continue = helper.handle_cmd_args(cluster_config, user_configs.USER_CONFIG.delete, user_configs.USER_CONFIG.user)
 
   if not should_continue:
     return
 
@@ -0,0 +1,158 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This module provides utility functions for custom argument parsing 
+and defines a comprehensive set of command-line arguments for configuring a machine learning workload.
+"""
+
+import argparse
+
+
+def parse_int_list(arg):
+  """Converts a comma-delimited string such as "1,2,3" into a list of ints."""
+  return list(map(int, arg.split(",")))
+
+
+def parse_str_list(arg):
+  """Parses a string with comma-separated values into a list of strings.
+
+  Whitespace surrounding each item is stripped, e.g. "a, b" -> ["a", "b"].
+  Spaces alone do not separate items: "a b" -> ["a b"].
+  """
+  return [s.strip() for s in arg.split(",")]
+
+
+def str2bool(v):
+  """Parses a string representation of a boolean value into a Python boolean.
+
+  Args:
+    v: Either a bool, which is returned unchanged, or a string. The strings
+      "true" and "false" are matched case-insensitively.
+
+  Returns:
+    The parsed boolean value.
+
+  Raises:
+    argparse.ArgumentTypeError: If v is a string other than "true"/"false".
+  """
+  if isinstance(v, bool):
+    return v
+  # Compare for equality. `x in ("true")` is a substring test against the
+  # string "true" (parentheses alone do not create a tuple), which would
+  # wrongly accept inputs such as "", "t" or "ru".
+  normalized = v.lower()
+  if normalized == "true":
+    return True
+  if normalized == "false":
+    return False
+  raise argparse.ArgumentTypeError("Boolean value expected (e.g., True or False).")
+
+
+def add_arguments(parser: argparse.ArgumentParser):
+  """Registers the shared workload-configuration flags on a parser.
+
+  The flags cover GCP/cluster settings, container images and their flags,
+  model selection, BigQuery logging, and miscellaneous options. List-valued
+  flags (--selected_model_framework, --selected_model_names,
+  --num_slices_list) take a single comma-separated string.
+
+  Args:
+    parser:  parser to add shared arguments to. It is modified in place.
+  """
+  # Add the arguments for each parser.
+  # GCP Configuration
+  parser.add_argument("--user", type=str, default="user_name", help="GCP user name.")
+  parser.add_argument(
+      "--cluster_name",
+      type=str,
+      default="test-v5e-32-cluster",
+      help="Name of the TPU cluster.",
+  )
+  parser.add_argument("--project", type=str, default="cloud-tpu-cluster", help="GCP project ID.")
+  parser.add_argument("--zone", type=str, default="us-south1-a", help="GCP zone for the cluster.")
+  parser.add_argument(
+      "--device_type",
+      type=str,
+      default="v5litepod-32",
+      help="Type of TPU device (e.g., v5litepod-32).",
+  )
+  parser.add_argument(
+      "--priority",
+      type=str,
+      choices=["low", "medium", "high", "very high"],
+      default="medium",
+      help="Priority of the job.",
+  )
+
+  # Image Configuration
+  # NOTE(review): the defaults/help of --server_image and --proxy_image were
+  # previously crossed (server flag -> proxy_server image and vice versa);
+  # each flag now defaults to the image its name refers to.
+  parser.add_argument(
+      "--server_image",
+      type=str,
+      default="us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server",
+      help="Docker image for the server.",
+  )
+  parser.add_argument(
+      "--proxy_image",
+      type=str,
+      default="us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server",
+      help="Docker image for the proxy server.",
+  )
+  parser.add_argument(
+      "--runner",
+      type=str,
+      default="us-docker.pkg.dev/path/to/maxtext_runner",
+      help="Docker image for the runner.",
+  )
+  parser.add_argument(
+      "--colocated_python_image",
+      type=str,
+      default=None,
+      help="Colocated Python image.",
+  )
+  parser.add_argument("--worker_flags", type=str, default="", help="Worker flags.")
+  parser.add_argument("--proxy_flags", type=str, default="", help="Proxy flags.")
+  parser.add_argument("--server_flags", type=str, default="", help="Server flags.")
+
+  # Model Configuration
+  parser.add_argument("--benchmark_steps", type=int, default=20, help="Number of benchmark steps.")
+  parser.add_argument(
+      "--headless",
+      action=argparse.BooleanOptionalAction,
+      default=False,
+      help="Run in headless mode.",
+  )
+  parser.add_argument(
+      "--selected_model_framework",
+      type=parse_str_list,
+      default=["pathways"],
+      help="List of model frameworks (e.g., pathways, mcjax)",
+  )
+  parser.add_argument(
+      "--selected_model_names",
+      type=parse_str_list,
+      default=["llama3_1_8b_8192_v5e_256"],
+      help="List of model names (e.g., llama3_1_8b_8192_v5e_256, llama2-7b-v5e-256)",
+  )
+  parser.add_argument(
+      "--num_slices_list",
+      type=parse_int_list,
+      default=[2],
+      help="List of number of slices.",
+  )
+
+  # BigQuery configuration
+  parser.add_argument(
+      "--bq_enable",
+      type=str2bool,
+      default=False,
+      help="Enable BigQuery logging. Must be True or False. Defaults to False.",
+  )
+
+  parser.add_argument(
+      "--bq_db_project",
+      type=str,
+      default="",
+      help="BigQuery project ID where the logging dataset resides.",
+  )
+
+  parser.add_argument(
+      "--bq_db_dataset",
+      type=str,
+      default="",
+      help="BigQuery dataset name where metrics will be written.",
+  )
+
+  # Other configurations
+  parser.add_argument("--xpk_path", type=str, default="~/xpk", help="Path to xpk.")
+  parser.add_argument("--delete", action="store_true", help="Delete the cluster workload")
+  parser.add_argument("--max_restarts", type=int, default=0, help="Maximum number of restarts")
+  parser.add_argument("--temp_key", type=str, default=None, help="Temporary placeholder code")
@@ -45,7 +45,7 @@ def main() -> None:
   """Main function to run the elastic training disruption test."""
   user_configs.USER_CONFIG.headless = False
   should_continue = helper.handle_cmd_args(
-      user_configs.USER_CONFIG.cluster_config, helper.DELETE, xpk_path=user_configs.USER_CONFIG.xpk_path
+      user_configs.USER_CONFIG.cluster_config, user_configs.USER_CONFIG.delete, user_configs.USER_CONFIG.user
   )
 
   if not should_continue:
 
@@ -23,12 +23,12 @@
 
 import benchmarks.recipes.args_helper as helper
 import maxtext_xpk_runner as mxr
-from recipes.user_configs import cluster_config, xpk_path, pathways_config, base_output_directory, headless_workload_name
+from recipes.user_configs import cluster_config, xpk_path, pathways_config, base_output_directory, headless_workload_name, delete, user
 
 
 def main() -> int:
   # Handle command line arguments using args_helper
-  should_continue = helper.handle_cmd_args(cluster_config, helper.DELETE, xpk_path=xpk_path)
+  should_continue = helper.handle_cmd_args(cluster_config, delete, user)
 
   if not should_continue:
     return 0
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ def main() -> None:`
`45`	`45`	`"""Main function to run the elastic training disruption test."""`
`46`	`46`	`user_configs.USER_CONFIG.headless = False`
`47`	`47`	`should_continue = helper.handle_cmd_args(`
`48`		`- user_configs.USER_CONFIG.cluster_config, helper.DELETE, xpk_path=user_configs.USER_CONFIG.xpk_path`
	`48`	`+ user_configs.USER_CONFIG.cluster_config, user_configs.USER_CONFIG.delete, user_configs.USER_CONFIG.user`
`49`	`49`	`)`
`50`	`50`
`51`	`51`	`if not should_continue:`