diff --git a/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/README.md b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/README.md
new file mode 100644
index 000000000..501fafcf5
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/README.md
@@ -0,0 +1,64 @@
+# MTC GRPO Training with RayCluster
+
+This directory contains configurations for running GRPO training with VERL using [HyperPod Managed Tiered Checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/managed-tier-checkpointing.html).
+
+## Files
+
+- `mtc-grpo-cluster.yaml` - RayCluster configuration
+- `submit-mtc-grpo.sh` - Script to submit the GRPO training job to the Ray cluster
+
+## Setup
+
+1. Source the environment variables:
+```bash
+source setup/env_vars
+```
+
+2. Create a service account that grants your pods S3 access by following [IRSA-README.md](../setup/IRSA-README.md).
+
+## Deploy the RayCluster
+```bash
+envsubst < managed-tiered-checkpointing/mtc-grpo-cluster.yaml | kubectl apply -f -
+```
+
+## Clone MTC-enabled VERL Code
+Delete the existing verl repo if you have already cloned it:
+```bash
+rm -rf verl
+```
+
+Clone the MTC-enabled VERL code. This is a fork of the main VERL repo with the checkpointing code modified to enable managed tiered checkpointing:
+```bash
+git clone https://github.com/aruncs2005/verl.git
+```
+
+## Submit the training job
+```bash
+./managed-tiered-checkpointing/submit-mtc-grpo.sh
+```
+
+## Monitoring
+
+- **Ray Dashboard**: http://localhost:8265 (after port forwarding; see the example below)
+- **View logs**: `kubectl logs -f <pod-name>`
+- **Check job status**: `ray job status <job-id>`
+- **Follow job logs**: `ray job logs <job-id> --follow`
+
+## Configuration
+
+Edit `submit-mtc-grpo.sh` to modify training parameters:
+
+- `train_prompt_bsz` - Training batch size
+- `train_prompt_mini_bsz` - Mini batch size for PPO
+- `train_prompt_micro_bsz_per_gpu` - Micro batch size per GPU
+- `n_resp_per_prompt` - Number of responses per prompt
+- `gen_tp` - Tensor parallelism for generation
+- Model path, data paths, S3 checkpoint location, etc.
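+
+For example, you can override a couple of these knobs through environment variables and forward the Ray dashboard before submitting. The sketch below assumes KubeRay's default head-service naming (`<cluster-name>-head-svc`) and the variable names read by `submit-mtc-grpo.sh`; adjust names to your cluster:
+
+```bash
+# Forward the Ray dashboard to http://localhost:8265
+kubectl port-forward svc/mtc-grpo-cluster-head-svc 8265:8265 &
+
+# Optionally override batch-size knobs read by submit-mtc-grpo.sh
+export TRAIN_BATCH_SIZE=64
+export N_RESP_PER_PROMPT=8
+
+./managed-tiered-checkpointing/submit-mtc-grpo.sh
+```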
+ +## Cleanup + +```bash +# Delete the RayCluster +kubectl delete raycluster mtc-grpo-cluster +``` \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml new file mode 100644 index 000000000..b816ca4db --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml @@ -0,0 +1,162 @@ +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + name: mtc-grpo-cluster + labels: + controller-tools.k8s.io: "1.0" + annotations: + karpenter.sh/do-not-disrupt: "true" +spec: + # Ray head pod template + headGroupSpec: + rayStartParams: + dashboard-host: '0.0.0.0' + metrics-export-port: '8080' + template: + spec: + serviceAccountName: ray-s3-sa + nodeSelector: + node.kubernetes.io/instance-type: $INSTANCE_TYPE + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-head + image: ${REGISTRY}${IMAGE}:${TAG} + env: + ## PROMETHEUS AND GRAFANA + - name: RAY_GRAFANA_IFRAME_HOST + value: http://localhost:3000 + - name: RAY_GRAFANA_HOST + value: http://prometheus-grafana.prometheus-system.svc:80 + - name: RAY_PROMETHEUS_HOST + value: http://prometheus-kube-prometheus-prometheus.prometheus-system.svc:9090 + ## EFA AND NCCL CONFIGURATION + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: + cpu: 8 + memory: 32Gi + requests: + cpu: 8 + memory: 32Gi + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8080 + name: metrics + volumeMounts: + - name: fsx-storage + mountPath: /fsx + - name: ray-logs + mountPath: /tmp/ray + - name: checkpoint-logs + mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: ray-logs + emptyDir: {} + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: checkpoint-logs + hostPath: + path: /var/logs/sagemaker_checkpointing + type: DirectoryOrCreate + workerGroupSpecs: + - replicas: $NUM_NODES + minReplicas: 1 + maxReplicas: 10 + groupName: gpu-group + rayStartParams: + num-gpus: "$NUM_GPU_PER_NODE" + metrics-export-port: '8080' + template: + spec: + serviceAccountName: ray-s3-sa + nodeSelector: + node.kubernetes.io/instance-type: $INSTANCE_TYPE + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-worker + image: ${REGISTRY}${IMAGE}:${TAG} + env: + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + 
resources: + limits: + nvidia.com/gpu: $NUM_GPU_PER_NODE + requests: + nvidia.com/gpu: $NUM_GPU_PER_NODE + ports: + - containerPort: 8080 + name: metrics + volumeMounts: + - name: ray-logs + mountPath: /tmp/ray + - name: fsx-storage + mountPath: /fsx + - name: checkpoint-logs + mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: ray-logs + emptyDir: {} + - name: checkpoint-logs + hostPath: + path: /var/logs/sagemaker_checkpointing + type: DirectoryOrCreate diff --git a/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh new file mode 100644 index 000000000..4680264cf --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Load environment variables +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../setup/env_vars" + +# Project configuration +project_name='verl_grpo_example_gsm8k' +exp_name='qwen3_0.6b_function_rm' + +# GRPO Algorithm parameters +adv_estimator=grpo +use_kl_in_reward=False +use_kl_loss=True +kl_loss_coef=0.001 +kl_loss_type=low_var_kl +entropy_coeff=0 + +# Token length configuration +max_prompt_length=512 +max_response_length=1024 +filter_overlong_prompts=True +truncation='error' + +# Training configuration +train_prompt_bsz=${TRAIN_BATCH_SIZE:-32} # Total batch size +gen_prompt_bsz=${GEN_BATCH_SIZE:-$train_prompt_bsz} +n_resp_per_prompt=${N_RESP_PER_PROMPT:-5} +train_prompt_mini_bsz=32 # Must be <= train_batch_size +train_prompt_micro_bsz_per_gpu=1 + +# Ray configuration +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} + +# Cluster configuration +NNODES=${NUM_NODES:-4} +GPUS_PER_NODE=${NUM_GPU_PER_NODE:-4} + +# Model and data paths +MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} +RAY_DATA_HOME=${RAY_DATA_HOME:-"/fsx/verl"} + +# Data files - using GSM8K dataset +TRAIN_FILE="${RAY_DATA_HOME}/data/gsm8k/train.parquet" +TEST_FILE="${RAY_DATA_HOME}/data/gsm8k/test.parquet" + +# S3 checkpoint configuration +S3_CHECKPOINT_BASE=${S3_CHECKPOINT_BASE:-"s3://s3-bucket-example"} +# Performance parameters +gen_tp=2 +log_prob_micro_bsz_per_gpu=32 +gpu_memory_utilization=0.6 + +# Memory optimization +param_offload=False +optimizer_offload=False +ref_param_offload=True + +# Print configuration for verification +echo "=== MTC GRPO Training Configuration ===" +echo "Project: ${project_name}" +echo "Experiment: ${exp_name}" +echo "Model: ${MODEL_PATH}" +echo "Nodes: ${NNODES}" +echo "GPUs per node: ${GPUS_PER_NODE}" +echo "Total GPUs: $((NNODES * GPUS_PER_NODE))" +echo "Data home: ${RAY_DATA_HOME}" +echo "S3 Checkpoints: ${S3_CHECKPOINT_BASE}" +echo "Ray address: ${RAY_ADDRESS}" +echo "==================================" + +# Submit Ray job +ray job submit --no-wait \ + --address "${RAY_ADDRESS}" \ + --working-dir "${WORKING_DIR}" \ + -- python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=${adv_estimator} \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=question \ + data.train_batch_size=${train_prompt_bsz} \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.filter_overlong_prompts=${filter_overlong_prompts} \ + data.truncation=${truncation} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.use_remove_padding=True \ + 
actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_prompt_micro_bsz_per_gpu} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.kl_loss_type=${kl_loss_type} \
+    actor_rollout_ref.actor.entropy_coeff=${entropy_coeff} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${param_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${optimizer_offload} \
+    actor_rollout_ref.actor.checkpoint.s3_base_path=${S3_CHECKPOINT_BASE} \
+    actor_rollout_ref.actor.checkpoint.ckpt_namespace=mtc-grpo-$(date +%s) \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_param_offload} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node=${GPUS_PER_NODE} \
+    trainer.nnodes=${NNODES} \
+    trainer.save_freq=1 \
+    trainer.test_freq=2 \
+    trainer.total_epochs=5 \
+    trainer.s3_base_path=${S3_CHECKPOINT_BASE}
+
+echo ""
+echo "Job submitted! Check status with: ray job status <job-id>"
+echo "Or view logs with: ray job logs <job-id> --follow"
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/IRSA-README.md b/3.test_cases/pytorch/verl/rlvr/setup/IRSA-README.md
new file mode 100644
index 000000000..3b8ef276c
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/setup/IRSA-README.md
@@ -0,0 +1,166 @@
+# IRSA Setup for Ray Pods S3 Access
+
+This guide explains how to set up IAM Roles for Service Accounts (IRSA) to give your Ray pods access to S3 for managed tiered checkpointing.
+
+## Prerequisites
+
+- `eksctl` installed (for OIDC provider setup)
+- AWS CLI configured with appropriate permissions
+- `kubectl` configured to access your EKS cluster
+
+## Quick Setup
+
+Run the automated setup script:
+
+```bash
+./setup/setup-irsa.sh
+```
+
+This script will:
+1. Check/create the OIDC provider for your EKS cluster
+2. Create an IAM policy with S3 full access
+3. Create an IAM role with a trust policy for the service account
+4. Attach the policy to the role
+5. Create a Kubernetes service account with the IAM role annotation
+
+## What Gets Created
+
+### IAM Policy
+- **Name**: `ray-s3-access-policy`
+- **Permissions**: Full S3 access (`s3:*`)
+
+### IAM Role
+- **Name**: `ray-s3-access-role`
+- **Trust Policy**: Allows the Kubernetes service account to assume this role via OIDC
+
+### Kubernetes Service Account
+- **Name**: `ray-s3-sa`
+- **Namespace**: `default`
+- **Annotation**: Links to the IAM role ARN
+
+## How It Works
+
+1. **OIDC Provider**: The EKS cluster has an OIDC identity provider that allows Kubernetes service accounts to authenticate with AWS IAM
+2. **Service Account**: Ray pods use a Kubernetes service account annotated with an IAM role ARN
+3. **IAM Role**: The IAM role has a trust policy that allows the service account to assume it
+4. **Credentials**: The AWS SDK automatically retrieves temporary credentials via the OIDC token
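+
+Concretely, the EKS Pod Identity webhook injects a projected token and two environment variables into any pod that uses the annotated service account, and the AWS SDK's web-identity credential provider picks them up automatically. A quick way to see this from a running pod (the values shown are illustrative):
+
+```bash
+kubectl exec -it <pod-name> -- env | grep -E 'AWS_ROLE_ARN|AWS_WEB_IDENTITY_TOKEN_FILE'
+# AWS_ROLE_ARN=arn:aws:iam::<account-id>:role/ray-s3-access-role
+# AWS_WEB_IDENTITY_TOKEN_FILE=/var/run/secrets/eks.amazonaws.com/serviceaccount/token
+```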
+
+## Verification
+
+After running the setup, verify the configuration:
+
+```bash
+# Check the service account
+kubectl get sa ray-s3-sa -n default -o yaml
+
+# Check the IAM role
+aws iam get-role --role-name ray-s3-access-role
+
+# Check the policy attachment
+aws iam list-attached-role-policies --role-name ray-s3-access-role
+```
+
+## Using with RayCluster
+
+The RayCluster YAML has been updated to use this service account. Both head and worker pods will have:
+
+```yaml
+spec:
+  serviceAccountName: ray-s3-sa
+```
+
+This gives them automatic S3 access without needing to manage credentials.
+
+## Testing S3 Access
+
+Once the cluster is deployed, you can test S3 access from a pod:
+
+```bash
+# Get a pod name
+POD_NAME=$(kubectl get pods -l ray.io/node-type=head -o jsonpath='{.items[0].metadata.name}')
+
+# Test S3 access
+kubectl exec -it $POD_NAME -- aws s3 ls s3://<your-bucket>/
+```
+
+## Troubleshooting
+
+### OIDC Provider Not Found
+If you get an error about the OIDC provider, install eksctl:
+```bash
+# macOS
+brew install eksctl
+
+# Linux
+curl --silent --location "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
+sudo mv /tmp/eksctl /usr/local/bin
+```
+
+### Credentials Not Working
+1. Check the service account annotation:
+   ```bash
+   kubectl get sa ray-s3-sa -o yaml
+   ```
+2. Check that the pod uses the service account:
+   ```bash
+   kubectl get pod <pod-name> -o yaml | grep serviceAccountName
+   ```
+3. Check the environment variables in the pod:
+   ```bash
+   kubectl exec -it <pod-name> -- env | grep AWS
+   ```
+
+### Permission Denied
+1. Verify the IAM role has the policy attached
+2. Check that the trust policy allows your service account
+3. Ensure the OIDC provider ARN matches in the trust policy
+
+## Security Considerations
+
+The current setup grants full S3 access (`s3:*`). For production, consider:
+
+1. **Restrict to specific buckets**:
+   ```json
+   "Resource": [
+     "arn:aws:s3:::<your-bucket>",
+     "arn:aws:s3:::<your-bucket>/*"
+   ]
+   ```
+
+2. **Limit actions**:
+   ```json
+   "Action": [
+     "s3:GetObject",
+     "s3:PutObject",
+     "s3:DeleteObject",
+     "s3:ListBucket"
+   ]
+   ```
+
+3. **Add conditions** for additional security (see the sketch below)
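+
+As a sketch of item 3, a policy statement can carry a condition block, for example restricting S3 calls to a specific VPC endpoint (the endpoint ID below is a placeholder):
+
+```json
+"Condition": {
+  "StringEquals": {
+    "aws:SourceVpce": "vpce-0123456789abcdef0"
+  }
+}
+```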
+
+## Cleanup
+
+To remove the IRSA setup:
+
+```bash
+# Delete the service account
+kubectl delete sa ray-s3-sa -n default
+
+# Detach the policy from the role
+aws iam detach-role-policy \
+  --role-name ray-s3-access-role \
+  --policy-arn arn:aws:iam::$(aws sts get-caller-identity --query Account --output text):policy/ray-s3-access-policy
+
+# Delete the IAM role
+aws iam delete-role --role-name ray-s3-access-role
+
+# Delete the IAM policy
+aws iam delete-policy \
+  --policy-arn arn:aws:iam::$(aws sts get-caller-identity --query Account --output text):policy/ray-s3-access-policy
+```
+
+## References
+
+- [EKS IRSA Documentation](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html)
+- [SageMaker HyperPod Managed Tiered Checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-managed-tiered-checkpointing.html)
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example b/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
index 8e2bc92ef..c6821c1df 100644
--- a/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
+++ b/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
@@ -23,6 +23,8 @@ export VERL_HOME="fsx/verl"
 export RAY_DASHBOARD_PORT=8265 # Local port for Ray dashboard (forwarded from cluster)
 export RAY_ADDRESS="http://localhost:${RAY_DASHBOARD_PORT}"
 export WORKING_DIR="$(pwd)/verl"
+export RAY_NAMESPACE="default" # Namespace where RayCluster runs
+
 # Job Env Vars (using NUM_NODES for consistency)
 export HF_TOKEN=
@@ -31,16 +33,11 @@ export NCCL_DEBUG=INFO
 # Memory optimization settings
 export RAY_memory_usage_threshold=0.85
-# Add these to reduce I/O pressure
-export TRAIN_BATCH_SIZE=32 # Reduced from 512
-export GEN_BATCH_SIZE=384 # Reduced from 1536
-export N_RESP_PER_PROMPT=2 # Reduced from 16
+# Training parameters
+export TRAIN_BATCH_SIZE=32
+export GEN_BATCH_SIZE=384
+export N_RESP_PER_PROMPT=2
-# Observability with HyperPod - Amazon Managed Prometheus & Grafana
-export AMP_WORKSPACE_ID="ws-xxxxxxxxxxxxxxxxx"
-export AMP_ENDPOINT="https://aps-workspaces.${AWS_REGION}.amazonaws.com/workspaces/${AMP_WORKSPACE_ID}"
-export GRAFANA_WORKSPACE_ID="g-xxxxxxxxxxxxxxxxx"
-export GRAFANA_ENDPOINT="https://${GRAFANA_WORKSPACE_ID}.grafana-workspace.${AWS_REGION}.amazonaws.com"
-export CLUSTER_ID=""
-export CLUSTER_NAME=""
-export RAY_NAMESPACE="default" # Namespace where RayCluster runs
+# Managed Tiered Checkpointing with HyperPod
+# S3 checkpoint configuration
+export S3_CHECKPOINT_BASE="s3://<your-bucket>/checkpoints"
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/setup-irsa.sh b/3.test_cases/pytorch/verl/rlvr/setup/setup-irsa.sh
new file mode 100644
index 000000000..0fed69afb
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/setup/setup-irsa.sh
@@ -0,0 +1,165 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Load environment variables
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/env_vars"
+
+# Configuration
+EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME}"
+AWS_REGION="${AWS_REGION}"
+AWS_ACCOUNT="${ACCOUNT}"
+NAMESPACE="default"
+SERVICE_ACCOUNT_NAME="ray-s3-sa"
+IAM_ROLE_NAME="ray-s3-access-role"
+IAM_POLICY_NAME="ray-s3-access-policy"
+
+echo "=== Setting up IRSA for Ray Pods ==="
+echo "EKS Cluster: ${EKS_CLUSTER_NAME}"
+echo "Region: ${AWS_REGION}"
+echo "Account: ${AWS_ACCOUNT}"
+echo "Namespace: ${NAMESPACE}"
+echo "Service Account: ${SERVICE_ACCOUNT_NAME}"
+echo ""
+
+# Step 1: Check if OIDC provider exists for the cluster
+echo "Step 1: Checking OIDC provider..."
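+# The issuer URL returned by describe-cluster has the form
+#   https://oidc.eks.<region>.amazonaws.com/id/<hex-id>
+# so splitting on '/' leaves the OIDC ID in field 5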
+OIDC_ID=$(aws eks describe-cluster --name ${EKS_CLUSTER_NAME} --region ${AWS_REGION} --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5)
+
+if [ -z "$OIDC_ID" ]; then
+  echo "ERROR: Could not get OIDC ID from cluster"
+  exit 1
+fi
+
+echo "OIDC ID: ${OIDC_ID}"
+
+# Check if the OIDC provider exists in IAM
+OIDC_PROVIDER_ARN="arn:aws:iam::${AWS_ACCOUNT}:oidc-provider/oidc.eks.${AWS_REGION}.amazonaws.com/id/${OIDC_ID}"
+if aws iam get-open-id-connect-provider --open-id-connect-provider-arn ${OIDC_PROVIDER_ARN} >/dev/null 2>&1; then
+  echo "✓ OIDC provider already exists"
+else
+  echo "Creating OIDC provider..."
+  eksctl utils associate-iam-oidc-provider --cluster=${EKS_CLUSTER_NAME} --region=${AWS_REGION} --approve
+  echo "✓ OIDC provider created"
+fi
+
+echo ""
+
+# Step 2: Create IAM policy for S3 access
+echo "Step 2: Creating IAM policy for S3 access..."
+
+# Check if the policy already exists
+POLICY_ARN="arn:aws:iam::${AWS_ACCOUNT}:policy/${IAM_POLICY_NAME}"
+if aws iam get-policy --policy-arn ${POLICY_ARN} >/dev/null 2>&1; then
+  echo "✓ IAM policy already exists: ${POLICY_ARN}"
+else
+  # Broad S3 policy as described in IRSA-README.md; scope it down for production
+  cat > /tmp/ray-s3-policy.json <<EOF
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": "s3:*",
+      "Resource": "*"
+    }
+  ]
+}
+EOF
+  aws iam create-policy \
+    --policy-name ${IAM_POLICY_NAME} \
+    --policy-document file:///tmp/ray-s3-policy.json
+  echo "✓ IAM policy created: ${POLICY_ARN}"
+fi
+
+echo ""
+
+# Step 3: Create IAM role with a trust policy for the service account
+echo "Step 3: Creating IAM role..."
+
+if aws iam get-role --role-name ${IAM_ROLE_NAME} >/dev/null 2>&1; then
+  echo "✓ IAM role already exists: ${IAM_ROLE_NAME}"
+else
+  # Standard IRSA trust policy: only the ray-s3-sa service account in the
+  # target namespace may assume this role via the cluster's OIDC provider
+  cat > /tmp/trust-policy.json <<EOF
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "${OIDC_PROVIDER_ARN}"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "oidc.eks.${AWS_REGION}.amazonaws.com/id/${OIDC_ID}:aud": "sts.amazonaws.com",
+          "oidc.eks.${AWS_REGION}.amazonaws.com/id/${OIDC_ID}:sub": "system:serviceaccount:${NAMESPACE}:${SERVICE_ACCOUNT_NAME}"
+        }
+      }
+    }
+  ]
+}
+EOF
+  aws iam create-role \
+    --role-name ${IAM_ROLE_NAME} \
+    --assume-role-policy-document file:///tmp/trust-policy.json
+  echo "✓ IAM role created: ${IAM_ROLE_NAME}"
+fi
+
+echo ""
+
+# Step 4: Attach the policy to the role (idempotent)
+echo "Step 4: Attaching policy to role..."
+aws iam attach-role-policy \
+  --role-name ${IAM_ROLE_NAME} \
+  --policy-arn ${POLICY_ARN}
+echo "✓ Policy attached"
+
+echo ""
+
+# Step 5: Create the Kubernetes service account annotated with the role ARN
+echo "Step 5: Creating Kubernetes service account..."
+cat > /tmp/ray-service-account.yaml <<EOF
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ${SERVICE_ACCOUNT_NAME}
+  namespace: ${NAMESPACE}
+  annotations:
+    eks.amazonaws.com/role-arn: arn:aws:iam::${AWS_ACCOUNT}:role/${IAM_ROLE_NAME}
+EOF
+kubectl apply -f /tmp/ray-service-account.yaml
+echo "✓ Service account ${SERVICE_ACCOUNT_NAME} created in namespace ${NAMESPACE}"
+
+echo ""
+echo "=== IRSA setup complete ==="
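+
+# Optional smoke test (assumes the amazon/aws-cli image can be pulled by the nodes):
+# kubectl run s3-test --rm -it --restart=Never \
+#   --overrides='{"spec":{"serviceAccountName":"ray-s3-sa"}}' \
+#   --image=amazon/aws-cli -- s3 ls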