@@ -71,7 +71,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder.
7171git clone https://github.com/ai-hypercomputer/gpu-recipes.git
7272cd gpu-recipes
7373export REPO_ROOT=`git rev-parse --show-toplevel`
74- export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-70b/nemo-pretraining-gke/2_nodes
74+ export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs256
7575cd $RECIPE_ROOT
7676```
7777
@@ -89,7 +89,7 @@ your client:
8989
9090 ```bash
9191 cd $RECIPE_ROOT
92- export WORKLOAD_NAME=$USER-a4-llama3-1-70b-2node
92+ export WORKLOAD_NAME=$USER-a4-llama3-1-70b
9393 helm install $WORKLOAD_NAME . -f values.yaml \
9494 --set-file workload_launcher=launcher.sh \
9595 --set-file workload_config=llama3-1-70b-seq8192-gbs256-mbs1-gpus16.py \
@@ -107,7 +107,7 @@ your client:
107107
108108 ``` bash
109109 cd $RECIPE_ROOT
110- export WORKLOAD_NAME=$USER-a4-llama3-1-70b-2node
110+ export WORKLOAD_NAME=$USER-a4-llama3-1-70b
111111 helm install $WORKLOAD_NAME . -f values.yaml \
112112 --set-file workload_launcher=launcher.sh \
113113 --set-file workload_config=llama3-1-70b-seq8192-gbs256-mbs1-gpus16.py \
@@ -124,7 +124,7 @@ your client:
124124To check the status of pods in your job, run the following command:
125125
126126```
127- kubectl get pods | grep $USER-a4-llama3-1-70b-2node
127+ kubectl get pods | grep $USER-a4-llama3-1-70b
128128```
129129
130130Replace the following:
@@ -141,13 +141,13 @@ Information about the training job's progress, including crucial details such as
141141loss, step count, and step time, is generated by the rank 0 process.
142142This process runs on the pod whose name begins with
143143`JOB_NAME_PREFIX-workload-0-0`.
144- For example: `$USER-a4-llama3-1-70b-2node-workload-0-0-s9zrv`.
144+ For example: `$USER-a4-llama3-1-70b-workload-0-0-s9zrv`.
145145
146146### Uninstall the Helm release
147147
148148You can delete the job and other resources created by the Helm chart. To
149149uninstall the Helm release, run the following command from your client:
150150
151151```bash
152- helm uninstall $USER-a4-llama3-1-70b-2node
153- ```
152+ helm uninstall $USER-a4-llama3-1-70b
153+ ```
0 commit comments