diff --git a/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/README.md b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/README.md
new file mode 100644
index 000000000..501fafcf5
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/README.md
@@ -0,0 +1,64 @@
+# MTC GRPO Training with RayCluster
+
+This directory contains configurations for running GRPO training with VERL using [HyperPod Managed Tiered Checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/managed-tier-checkpointing.html).
+
+## Files
+
+- `mtc-grpo-cluster.yaml` - RayCluster configuration
+- `submit-mtc-grpo.sh` - Script to submit the GRPO training job to the Ray cluster
+
+## Setup
+
+1. Source the environment variables:
+```bash
+source setup/env_vars
+```
+
+2. Create a service account that grants your pods S3 access by following [IRSA-README.md](../setup/IRSA-README.md).
+
+## Deploy the RayCluster
+```bash
+envsubst < managed-tiered-checkpointing/mtc-grpo-cluster.yaml | kubectl apply -f -
+```
+
+## Clone MTC-enabled VERL Code
+Delete the existing verl repo if you have already cloned it:
+```bash
+rm -rf verl
+```
+
+Clone the MTC-enabled VERL code. This is a fork of the main VERL repo with the checkpointing code modified to enable managed tiered checkpointing:
+```bash
+git clone https://github.com/aruncs2005/verl.git
+```
+
+## Submit the training job
+```bash
+./managed-tiered-checkpointing/submit-mtc-grpo.sh
+```
+
+## Monitoring
+
+- **Ray Dashboard**: http://localhost:8265 (after port forwarding; see the example below)
+- **View logs**: `kubectl logs -f <pod-name>`
+- **Check job status**: `ray job status <job-id>`
+- **Follow job logs**: `ray job logs <job-id> --follow`
+
+## Configuration
+
+Edit `submit-mtc-grpo.sh` to modify training parameters:
+
+- `train_prompt_bsz` - Training batch size
+- `train_prompt_mini_bsz` - Mini batch size for PPO
+- `train_prompt_micro_bsz_per_gpu` - Micro batch size per GPU
+- `n_resp_per_prompt` - Number of responses per prompt
+- `gen_tp` - Tensor parallelism for generation
+- Model path, data paths, S3 checkpoint location, etc.
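+
+For example, you can override a couple of these knobs through environment variables and forward the Ray dashboard before submitting. The sketch below assumes KubeRay's default head-service naming (`<cluster-name>-head-svc`) and the variable names read by `submit-mtc-grpo.sh`; adjust names to your cluster:
+
+```bash
+# Forward the Ray dashboard to http://localhost:8265
+kubectl port-forward svc/mtc-grpo-cluster-head-svc 8265:8265 &
+
+# Optionally override batch-size knobs read by submit-mtc-grpo.sh
+export TRAIN_BATCH_SIZE=64
+export N_RESP_PER_PROMPT=8
+
+./managed-tiered-checkpointing/submit-mtc-grpo.sh
+```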
+ +## Cleanup + +```bash +# Delete the RayCluster +kubectl delete raycluster mtc-grpo-cluster +``` \ No newline at end of file diff --git a/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml new file mode 100644 index 000000000..b816ca4db --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/mtc-grpo-cluster.yaml @@ -0,0 +1,162 @@ +apiVersion: ray.io/v1alpha1 +kind: RayCluster +metadata: + name: mtc-grpo-cluster + labels: + controller-tools.k8s.io: "1.0" + annotations: + karpenter.sh/do-not-disrupt: "true" +spec: + # Ray head pod template + headGroupSpec: + rayStartParams: + dashboard-host: '0.0.0.0' + metrics-export-port: '8080' + template: + spec: + serviceAccountName: ray-s3-sa + nodeSelector: + node.kubernetes.io/instance-type: $INSTANCE_TYPE + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-head + image: ${REGISTRY}${IMAGE}:${TAG} + env: + ## PROMETHEUS AND GRAFANA + - name: RAY_GRAFANA_IFRAME_HOST + value: http://localhost:3000 + - name: RAY_GRAFANA_HOST + value: http://prometheus-grafana.prometheus-system.svc:80 + - name: RAY_PROMETHEUS_HOST + value: http://prometheus-kube-prometheus-prometheus.prometheus-system.svc:9090 + ## EFA AND NCCL CONFIGURATION + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: + cpu: 8 + memory: 32Gi + requests: + cpu: 8 + memory: 32Gi + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8080 + name: metrics + volumeMounts: + - name: fsx-storage + mountPath: /fsx + - name: ray-logs + mountPath: /tmp/ray + - name: checkpoint-logs + mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: ray-logs + emptyDir: {} + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: checkpoint-logs + hostPath: + path: /var/logs/sagemaker_checkpointing + type: DirectoryOrCreate + workerGroupSpecs: + - replicas: $NUM_NODES + minReplicas: 1 + maxReplicas: 10 + groupName: gpu-group + rayStartParams: + num-gpus: "$NUM_GPU_PER_NODE" + metrics-export-port: '8080' + template: + spec: + serviceAccountName: ray-s3-sa + nodeSelector: + node.kubernetes.io/instance-type: $INSTANCE_TYPE + sagemaker.amazonaws.com/node-health-status: Schedulable + securityContext: + runAsUser: 0 + runAsGroup: 0 + fsGroup: 0 + containers: + - name: ray-worker + image: ${REGISTRY}${IMAGE}:${TAG} + env: + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + - name: NCCL_PROTO + value: "simple" + - name: NCCL_SOCKET_IFNAME + value: "^docker,lo,veth" + - name: NCCL_DEBUG + value: "INFO" + - name: TORCH_NCCL_DUMP_ON_TIMEOUT + value: "1" + - name: TORCH_NCCL_ASYNC_ERROR_HANDLING + value: "1" + - name: HF_TOKEN + value: ${HF_TOKEN} + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + 
resources: + limits: + nvidia.com/gpu: $NUM_GPU_PER_NODE + requests: + nvidia.com/gpu: $NUM_GPU_PER_NODE + ports: + - containerPort: 8080 + name: metrics + volumeMounts: + - name: ray-logs + mountPath: /tmp/ray + - name: fsx-storage + mountPath: /fsx + - name: checkpoint-logs + mountPath: /var/log/sagemaker_checkpointing + volumes: + - name: fsx-storage + persistentVolumeClaim: + claimName: fsx-claim + - name: ray-logs + emptyDir: {} + - name: checkpoint-logs + hostPath: + path: /var/logs/sagemaker_checkpointing + type: DirectoryOrCreate diff --git a/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh new file mode 100644 index 000000000..4680264cf --- /dev/null +++ b/3.test_cases/pytorch/verl/rlvr/managed-tiered-checkpointing/submit-mtc-grpo.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +# Load environment variables +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/../setup/env_vars" + +# Project configuration +project_name='verl_grpo_example_gsm8k' +exp_name='qwen3_0.6b_function_rm' + +# GRPO Algorithm parameters +adv_estimator=grpo +use_kl_in_reward=False +use_kl_loss=True +kl_loss_coef=0.001 +kl_loss_type=low_var_kl +entropy_coeff=0 + +# Token length configuration +max_prompt_length=512 +max_response_length=1024 +filter_overlong_prompts=True +truncation='error' + +# Training configuration +train_prompt_bsz=${TRAIN_BATCH_SIZE:-32} # Total batch size +gen_prompt_bsz=${GEN_BATCH_SIZE:-$train_prompt_bsz} +n_resp_per_prompt=${N_RESP_PER_PROMPT:-5} +train_prompt_mini_bsz=32 # Must be <= train_batch_size +train_prompt_micro_bsz_per_gpu=1 + +# Ray configuration +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} + +# Cluster configuration +NNODES=${NUM_NODES:-4} +GPUS_PER_NODE=${NUM_GPU_PER_NODE:-4} + +# Model and data paths +MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} +RAY_DATA_HOME=${RAY_DATA_HOME:-"/fsx/verl"} + +# Data files - using GSM8K dataset +TRAIN_FILE="${RAY_DATA_HOME}/data/gsm8k/train.parquet" +TEST_FILE="${RAY_DATA_HOME}/data/gsm8k/test.parquet" + +# S3 checkpoint configuration +S3_CHECKPOINT_BASE=${S3_CHECKPOINT_BASE:-"s3://s3-bucket-example"} +# Performance parameters +gen_tp=2 +log_prob_micro_bsz_per_gpu=32 +gpu_memory_utilization=0.6 + +# Memory optimization +param_offload=False +optimizer_offload=False +ref_param_offload=True + +# Print configuration for verification +echo "=== MTC GRPO Training Configuration ===" +echo "Project: ${project_name}" +echo "Experiment: ${exp_name}" +echo "Model: ${MODEL_PATH}" +echo "Nodes: ${NNODES}" +echo "GPUs per node: ${GPUS_PER_NODE}" +echo "Total GPUs: $((NNODES * GPUS_PER_NODE))" +echo "Data home: ${RAY_DATA_HOME}" +echo "S3 Checkpoints: ${S3_CHECKPOINT_BASE}" +echo "Ray address: ${RAY_ADDRESS}" +echo "==================================" + +# Submit Ray job +ray job submit --no-wait \ + --address "${RAY_ADDRESS}" \ + --working-dir "${WORKING_DIR}" \ + -- python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=${adv_estimator} \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=question \ + data.train_batch_size=${train_prompt_bsz} \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.filter_overlong_prompts=${filter_overlong_prompts} \ + data.truncation=${truncation} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.use_remove_padding=True \ + 
actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_prompt_micro_bsz_per_gpu} \
+    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+    actor_rollout_ref.actor.kl_loss_type=${kl_loss_type} \
+    actor_rollout_ref.actor.entropy_coeff=${entropy_coeff} \
+    actor_rollout_ref.actor.fsdp_config.param_offload=${param_offload} \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${optimizer_offload} \
+    actor_rollout_ref.actor.checkpoint.s3_base_path=${S3_CHECKPOINT_BASE} \
+    actor_rollout_ref.actor.checkpoint.ckpt_namespace=mtc-grpo-$(date +%s) \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+    actor_rollout_ref.rollout.name=vllm \
+    actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${log_prob_micro_bsz_per_gpu} \
+    actor_rollout_ref.ref.fsdp_config.param_offload=${ref_param_offload} \
+    algorithm.use_kl_in_reward=${use_kl_in_reward} \
+    trainer.critic_warmup=0 \
+    trainer.logger='["console"]' \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.n_gpus_per_node=${GPUS_PER_NODE} \
+    trainer.nnodes=${NNODES} \
+    trainer.save_freq=1 \
+    trainer.test_freq=2 \
+    trainer.total_epochs=5 \
+    trainer.s3_base_path=${S3_CHECKPOINT_BASE}
+
+echo ""
+echo "Job submitted! Check status with: ray job status <job-id>"
+echo "Or view logs with: ray job logs <job-id> --follow"
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/IRSA-README.md b/3.test_cases/pytorch/verl/rlvr/setup/IRSA-README.md
new file mode 100644
index 000000000..3b8ef276c
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/setup/IRSA-README.md
@@ -0,0 +1,166 @@
+# IRSA Setup for Ray Pods S3 Access
+
+This guide explains how to set up IAM Roles for Service Accounts (IRSA) to give your Ray pods access to S3 for managed tiered checkpointing.
+
+## Prerequisites
+
+- `eksctl` installed (for OIDC provider setup)
+- AWS CLI configured with appropriate permissions
+- `kubectl` configured to access your EKS cluster
+
+## Quick Setup
+
+Run the automated setup script:
+
+```bash
+./setup/setup-irsa.sh
+```
+
+This script will:
+1. Check/create the OIDC provider for your EKS cluster
+2. Create an IAM policy with S3 full access
+3. Create an IAM role with a trust policy for the service account
+4. Attach the policy to the role
+5. Create a Kubernetes service account with the IAM role annotation
+
+## What Gets Created
+
+### IAM Policy
+- **Name**: `ray-s3-access-policy`
+- **Permissions**: Full S3 access (`s3:*`)
+
+### IAM Role
+- **Name**: `ray-s3-access-role`
+- **Trust Policy**: Allows the Kubernetes service account to assume this role via OIDC
+
+### Kubernetes Service Account
+- **Name**: `ray-s3-sa`
+- **Namespace**: `default`
+- **Annotation**: Links to the IAM role ARN
+
+## How It Works
+
+1. **OIDC Provider**: The EKS cluster has an OIDC identity provider that allows Kubernetes service accounts to authenticate with AWS IAM
+2. **Service Account**: Ray pods use a Kubernetes service account annotated with an IAM role ARN
+3. **IAM Role**: The IAM role has a trust policy that allows the service account to assume it
+4. **Credentials**: The AWS SDK automatically retrieves temporary credentials via the OIDC token
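+
+Concretely, the EKS Pod Identity webhook injects a projected token and two environment variables into any pod that uses the annotated service account, and the AWS SDK's web-identity credential provider picks them up automatically. A quick way to see this from a running pod (the values shown are illustrative):
+
+```bash
+kubectl exec -it <pod-name> -- env | grep -E 'AWS_ROLE_ARN|AWS_WEB_IDENTITY_TOKEN_FILE'
+# AWS_ROLE_ARN=arn:aws:iam::<account-id>:role/ray-s3-access-role
+# AWS_WEB_IDENTITY_TOKEN_FILE=/var/run/secrets/eks.amazonaws.com/serviceaccount/token
+```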
+
+## Verification
+
+After running the setup, verify the configuration:
+
+```bash
+# Check the service account
+kubectl get sa ray-s3-sa -n default -o yaml
+
+# Check the IAM role
+aws iam get-role --role-name ray-s3-access-role
+
+# Check the policy attachment
+aws iam list-attached-role-policies --role-name ray-s3-access-role
+```
+
+## Using with RayCluster
+
+The RayCluster YAML has been updated to use this service account. Both head and worker pods will have:
+
+```yaml
+spec:
+  serviceAccountName: ray-s3-sa
+```
+
+This gives them automatic S3 access without needing to manage credentials.
+
+## Testing S3 Access
+
+Once the cluster is deployed, you can test S3 access from a pod:
+
+```bash
+# Get a pod name
+POD_NAME=$(kubectl get pods -l ray.io/node-type=head -o jsonpath='{.items[0].metadata.name}')
+
+# Test S3 access
+kubectl exec -it $POD_NAME -- aws s3 ls s3://<your-bucket>/
+```
+
+## Troubleshooting
+
+### OIDC Provider Not Found
+If you get an error about the OIDC provider, install eksctl:
+```bash
+# macOS
+brew install eksctl
+
+# Linux
+curl --silent --location "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
+sudo mv /tmp/eksctl /usr/local/bin
+```
+
+### Credentials Not Working
+1. Check the service account annotation:
+   ```bash
+   kubectl get sa ray-s3-sa -o yaml
+   ```
+2. Check that the pod uses the service account:
+   ```bash
+   kubectl get pod <pod-name> -o yaml | grep serviceAccountName
+   ```
+3. Check the environment variables in the pod:
+   ```bash
+   kubectl exec -it <pod-name> -- env | grep AWS
+   ```
+
+### Permission Denied
+1. Verify the IAM role has the policy attached
+2. Check that the trust policy allows your service account
+3. Ensure the OIDC provider ARN matches in the trust policy
+
+## Security Considerations
+
+The current setup grants full S3 access (`s3:*`). For production, consider:
+
+1. **Restrict to specific buckets**:
+   ```json
+   "Resource": [
+     "arn:aws:s3:::<your-bucket>",
+     "arn:aws:s3:::<your-bucket>/*"
+   ]
+   ```
+
+2. **Limit actions**:
+   ```json
+   "Action": [
+     "s3:GetObject",
+     "s3:PutObject",
+     "s3:DeleteObject",
+     "s3:ListBucket"
+   ]
+   ```
+
+3. **Add conditions** for additional security (see the sketch below)
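+
+As a sketch of item 3, a policy statement can carry a condition block, for example restricting S3 calls to a specific VPC endpoint (the endpoint ID below is a placeholder):
+
+```json
+"Condition": {
+  "StringEquals": {
+    "aws:SourceVpce": "vpce-0123456789abcdef0"
+  }
+}
+```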
+
+## Cleanup
+
+To remove the IRSA setup:
+
+```bash
+# Delete the service account
+kubectl delete sa ray-s3-sa -n default
+
+# Detach the policy from the role
+aws iam detach-role-policy \
+  --role-name ray-s3-access-role \
+  --policy-arn arn:aws:iam::$(aws sts get-caller-identity --query Account --output text):policy/ray-s3-access-policy
+
+# Delete the IAM role
+aws iam delete-role --role-name ray-s3-access-role
+
+# Delete the IAM policy
+aws iam delete-policy \
+  --policy-arn arn:aws:iam::$(aws sts get-caller-identity --query Account --output text):policy/ray-s3-access-policy
+```
+
+## References
+
+- [EKS IRSA Documentation](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html)
+- [SageMaker HyperPod Managed Tiered Checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-managed-tiered-checkpointing.html)
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example b/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
index 8e2bc92ef..c6821c1df 100644
--- a/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
+++ b/3.test_cases/pytorch/verl/rlvr/setup/env_vars.example
@@ -23,6 +23,8 @@ export VERL_HOME="fsx/verl"
 export RAY_DASHBOARD_PORT=8265 # Local port for Ray dashboard (forwarded from cluster)
 export RAY_ADDRESS="http://localhost:${RAY_DASHBOARD_PORT}"
 export WORKING_DIR="$(pwd)/verl"
+export RAY_NAMESPACE="default" # Namespace where RayCluster runs
+
 # Job Env Vars (using NUM_NODES for consistency)
 export HF_TOKEN=
@@ -31,16 +33,11 @@ export NCCL_DEBUG=INFO
 # Memory optimization settings
 export RAY_memory_usage_threshold=0.85
-# Add these to reduce I/O pressure
-export TRAIN_BATCH_SIZE=32 # Reduced from 512
-export GEN_BATCH_SIZE=384 # Reduced from 1536
-export N_RESP_PER_PROMPT=2 # Reduced from 16
+# Training parameters
+export TRAIN_BATCH_SIZE=32
+export GEN_BATCH_SIZE=384
+export N_RESP_PER_PROMPT=2
-# Observability with HyperPod - Amazon Managed Prometheus & Grafana
-export AMP_WORKSPACE_ID="ws-xxxxxxxxxxxxxxxxx"
-export AMP_ENDPOINT="https://aps-workspaces.${AWS_REGION}.amazonaws.com/workspaces/${AMP_WORKSPACE_ID}"
-export GRAFANA_WORKSPACE_ID="g-xxxxxxxxxxxxxxxxx"
-export GRAFANA_ENDPOINT="https://${GRAFANA_WORKSPACE_ID}.grafana-workspace.${AWS_REGION}.amazonaws.com"
-export CLUSTER_ID=""
-export CLUSTER_NAME=""
-export RAY_NAMESPACE="default" # Namespace where RayCluster runs
+# Managed Tiered Checkpointing with HyperPod
+# S3 checkpoint configuration
+export S3_CHECKPOINT_BASE="s3://<your-bucket>/checkpoints"
diff --git a/3.test_cases/pytorch/verl/rlvr/setup/setup-irsa.sh b/3.test_cases/pytorch/verl/rlvr/setup/setup-irsa.sh
new file mode 100644
index 000000000..0fed69afb
--- /dev/null
+++ b/3.test_cases/pytorch/verl/rlvr/setup/setup-irsa.sh
@@ -0,0 +1,165 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Load environment variables
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/env_vars"
+
+# Configuration
+EKS_CLUSTER_NAME="${EKS_CLUSTER_NAME}"
+AWS_REGION="${AWS_REGION}"
+AWS_ACCOUNT="${ACCOUNT}"
+NAMESPACE="default"
+SERVICE_ACCOUNT_NAME="ray-s3-sa"
+IAM_ROLE_NAME="ray-s3-access-role"
+IAM_POLICY_NAME="ray-s3-access-policy"
+
+echo "=== Setting up IRSA for Ray Pods ==="
+echo "EKS Cluster: ${EKS_CLUSTER_NAME}"
+echo "Region: ${AWS_REGION}"
+echo "Account: ${AWS_ACCOUNT}"
+echo "Namespace: ${NAMESPACE}"
+echo "Service Account: ${SERVICE_ACCOUNT_NAME}"
+echo ""
+
+# Step 1: Check if OIDC provider exists for the cluster
+echo "Step 1: Checking OIDC provider..."
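+# The issuer URL returned by describe-cluster has the form
+#   https://oidc.eks.<region>.amazonaws.com/id/<hex-id>
+# so splitting on '/' leaves the OIDC ID in field 5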
+OIDC_ID=$(aws eks describe-cluster --name ${EKS_CLUSTER_NAME} --region ${AWS_REGION} --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5)
+
+if [ -z "$OIDC_ID" ]; then
+  echo "ERROR: Could not get OIDC ID from cluster"
+  exit 1
+fi
+
+echo "OIDC ID: ${OIDC_ID}"
+
+# Check if the OIDC provider exists in IAM
+OIDC_PROVIDER_ARN="arn:aws:iam::${AWS_ACCOUNT}:oidc-provider/oidc.eks.${AWS_REGION}.amazonaws.com/id/${OIDC_ID}"
+if aws iam get-open-id-connect-provider --open-id-connect-provider-arn ${OIDC_PROVIDER_ARN} >/dev/null 2>&1; then
+  echo "✓ OIDC provider already exists"
+else
+  echo "Creating OIDC provider..."
+  eksctl utils associate-iam-oidc-provider --cluster=${EKS_CLUSTER_NAME} --region=${AWS_REGION} --approve
+  echo "✓ OIDC provider created"
+fi
+
+echo ""
+
+# Step 2: Create IAM policy for S3 access
+echo "Step 2: Creating IAM policy for S3 access..."
+
+# Check if the policy already exists
+POLICY_ARN="arn:aws:iam::${AWS_ACCOUNT}:policy/${IAM_POLICY_NAME}"
+if aws iam get-policy --policy-arn ${POLICY_ARN} >/dev/null 2>&1; then
+  echo "✓ IAM policy already exists: ${POLICY_ARN}"
+else
+  # Broad S3 policy as described in IRSA-README.md; scope it down for production
+  cat > /tmp/ray-s3-policy.json <<EOF
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": "s3:*",
+      "Resource": "*"
+    }
+  ]
+}
+EOF
+  aws iam create-policy \
+    --policy-name ${IAM_POLICY_NAME} \
+    --policy-document file:///tmp/ray-s3-policy.json
+  echo "✓ IAM policy created: ${POLICY_ARN}"
+fi
+
+echo ""
+
+# Step 3: Create IAM role with a trust policy for the service account
+echo "Step 3: Creating IAM role..."
+
+if aws iam get-role --role-name ${IAM_ROLE_NAME} >/dev/null 2>&1; then
+  echo "✓ IAM role already exists: ${IAM_ROLE_NAME}"
+else
+  # Standard IRSA trust policy: only the ray-s3-sa service account in the
+  # target namespace may assume this role via the cluster's OIDC provider
+  cat > /tmp/trust-policy.json <<EOF
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "${OIDC_PROVIDER_ARN}"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "oidc.eks.${AWS_REGION}.amazonaws.com/id/${OIDC_ID}:aud": "sts.amazonaws.com",
+          "oidc.eks.${AWS_REGION}.amazonaws.com/id/${OIDC_ID}:sub": "system:serviceaccount:${NAMESPACE}:${SERVICE_ACCOUNT_NAME}"
+        }
+      }
+    }
+  ]
+}
+EOF
+  aws iam create-role \
+    --role-name ${IAM_ROLE_NAME} \
+    --assume-role-policy-document file:///tmp/trust-policy.json
+  echo "✓ IAM role created: ${IAM_ROLE_NAME}"
+fi
+
+echo ""
+
+# Step 4: Attach the policy to the role (idempotent)
+echo "Step 4: Attaching policy to role..."
+aws iam attach-role-policy \
+  --role-name ${IAM_ROLE_NAME} \
+  --policy-arn ${POLICY_ARN}
+echo "✓ Policy attached"
+
+echo ""
+
+# Step 5: Create the Kubernetes service account annotated with the role ARN
+echo "Step 5: Creating Kubernetes service account..."
+cat > /tmp/ray-service-account.yaml <<EOF
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ${SERVICE_ACCOUNT_NAME}
+  namespace: ${NAMESPACE}
+  annotations:
+    eks.amazonaws.com/role-arn: arn:aws:iam::${AWS_ACCOUNT}:role/${IAM_ROLE_NAME}
+EOF
+kubectl apply -f /tmp/ray-service-account.yaml
+echo "✓ Service account ${SERVICE_ACCOUNT_NAME} created in namespace ${NAMESPACE}"
+
+echo ""
+echo "=== IRSA setup complete ==="
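+
+# Optional smoke test (assumes the amazon/aws-cli image can be pulled by the nodes):
+# kubectl run s3-test --rm -it --restart=Never \
+#   --overrides='{"spec":{"serviceAccountName":"ray-s3-sa"}}' \
+#   --image=amazon/aws-cli -- s3 ls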