@@ -84,13 +84,13 @@ As a prerequisite, you will need to identify or create input and output S3 buckets

To create new S3 buckets, you can execute commands like the following example using the AWS CLI:
```bash
aws s3 mb s3://my-tf-rig-test-input-bucket --region us-east-1 # adjust region as needed

aws s3 mb s3://my-tf-rig-test-output-bucket --region us-east-1 # adjust region as needed
```
S3 bucket names must be globally unique.
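Because bucket names are global across all AWS accounts, `aws s3 mb` fails if the name is already taken. One common convention (an assumption on our part, not a requirement of this guide) is to suffix a base name with your AWS account ID:

```shell
# Hypothetical naming helper: suffix a base name with the account ID to avoid collisions.
# Replace the placeholder with the output of:
#   aws sts get-caller-identity --query Account --output text
ACCOUNT_ID="123456789012"  # placeholder account ID
INPUT_BUCKET="my-tf-rig-test-input-bucket-${ACCOUNT_ID}"
OUTPUT_BUCKET="my-tf-rig-test-output-bucket-${ACCOUNT_ID}"
echo "${INPUT_BUCKET} ${OUTPUT_BUCKET}"
```

You would then pass these names as `rig_input_s3_bucket` and `rig_output_s3_bucket` in your tfvars file.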

You will also need to have [yq](https://pypi.org/project/yq/) installed so that a bash script that modifies CoreDNS and VPC CNI deployments can execute properly.
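A quick preflight check like the following can confirm the tool is available before you deploy (a sketch, assuming the Python `yq` from PyPI, which wraps `jq`, is the variant needed):

```shell
# Sketch: verify yq is on PATH before the CoreDNS/VPC CNI patch script runs.
if command -v yq >/dev/null 2>&1; then
  YQ_STATUS="present"
else
  YQ_STATUS="missing"   # install with: pip install yq
fi
echo "yq: ${YQ_STATUS}"
```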

For Nova model customization using Restricted Instance Groups (RIG), you can use the example configuration in [`rig_custom.tfvars`](./hyperpod-eks-tf/rig_custom.tfvars). This file demonstrates how to configure restricted instance groups with the necessary S3 buckets and instance specifications.

@@ -106,7 +106,6 @@ aws_region = "us-east-1"
availability_zone_id = "use1-az6"
rig_input_s3_bucket = "my-tf-rig-test-input-bucket"
rig_output_s3_bucket = "my-tf-rig-test-output-bucket"
restricted_instance_groups = {
rig-1 = {
instance_type = "ml.p5.48xlarge",
@@ -122,10 +121,14 @@ restricted_instance_groups = {
EOL
```
RIG mode (`local.rig_mode = true` set in [main.tf](./hyperpod-eks-tf/main.tf)) is automatic when `restricted_instance_groups` are defined, enabling Nova model customization with the following changes:
- **VPC Endpoints**: Lambda and SQS interface endpoints are added for reinforcement fine-tuning (RFT) with integrations for your custom reward service hosted outside of the RIG. These endpoints are enabled in RIG mode by default so that you can easily transition from continuous pre-training (CPT) or supervised fine-tuning (SFT) to RFT without making infrastructure changes, but they can be disabled by setting `rig_rft_lambda_access` and `rig_rft_sqs_access` to false.
- **IAM Execution Role Permissions**: The execution role associated with the HyperPod nodes is expanded to include read permissions on your input S3 bucket and write permissions on your output S3 bucket. Access to SQS and Lambda resources matching the ARN patterns `arn:aws:lambda:*:*:function:*SageMaker*` and `arn:aws:sqs:*:*:*SageMaker*` is also conditionally added if `rig_rft_lambda_access` and `rig_rft_sqs_access` are true (the default).
- **Helm Charts**: A specific Helm revision is checked out and used for RIG support. After Helm chart installation, a bash script modifies the CoreDNS and VPC CNI deployments (be sure to have [yq](https://pypi.org/project/yq/) installed for this).
- **HyperPod Cluster**: Continuous provisioning mode and Karpenter autoscaling are disabled automatically for RIG compatibility. Deploying a HyperPod cluster with a combination of standard instance groups and RIGs is also not currently supported, so `instance_groups` definitions are ignored when `restricted_instance_groups` are defined.
- **FSx for Lustre**: For RIGs, a service-managed FSx for Lustre file system is created based on the specifications you provide in `fsxl_per_unit_storage_throughput` and `fsxl_size_in_gi_b`.
  - Valid values for `fsxl_per_unit_storage_throughput` are 125, 250, 500, or 1000 MB/s per TiB.
  - Valid values for `fsxl_size_in_gi_b` start at 1200 GiB and increase in increments of 2400 GiB.
- **S3 Lifecycle Scripts**: Because RIGs do not leverage lifecycle scripts, the `s3_bucket` and `lifecycle_script` modules are also disabled in RIG mode.
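For example, the FSx for Lustre inputs above could be set in your tfvars file like this (the values are illustrative picks from the valid ranges, not defaults from this change):

```hcl
# Illustrative FSx for Lustre sizing for a RIG deployment:
fsxl_per_unit_storage_throughput = 250   # MB/s per TiB; valid: 125, 250, 500, 1000
fsxl_size_in_gi_b                = 3600  # GiB; valid: 1200, then +2400 steps (3600, 6000, ...)
```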

Please note that the following addons are NOT currently supported on HyperPod with RIGs:
- HyperPod Task Governance
@@ -135,8 +138,6 @@

Do not attempt to install these addons later using the console.
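If you maintain your own tfvars file, one way to make this explicit is to pin the related flags to their defaults. The variable names below appear in this change, but treat the snippet as an illustrative sketch rather than a complete list of addon switches:

```hcl
# Keep unsupported addon-related flags disabled for RIG deployments (illustrative):
enable_cluster_role_and_bindings    = false
enable_namespaced_role_and_bindings = false
enable_team_role_and_bindings       = false
enable_mlflow                       = false
```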

Once your `rig_custom.tfvars` file is created, you can proceed to deployment.

---
@@ -9,18 +9,19 @@ data "aws_s3_bucket" "existing_s3_bucket" {
}

locals {
rig_mode = length(var.restricted_instance_groups) > 0
vpc_id = var.create_vpc_module ? module.vpc[0].vpc_id : var.existing_vpc_id
private_subnet_id = var.create_private_subnet_module ? module.private_subnet[0].private_subnet_id : var.existing_private_subnet_id
security_group_id = var.create_security_group_module ? module.security_group[0].security_group_id : var.existing_security_group_id
s3_bucket_name = !local.rig_mode ? (var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name) : null
eks_cluster_name = var.create_eks_module ? module.eks_cluster[0].eks_cluster_name : var.existing_eks_cluster_name
sagemaker_iam_role_name = var.create_sagemaker_iam_role_module ? module.sagemaker_iam_role[0].sagemaker_iam_role_name : var.existing_sagemaker_iam_role_name
deploy_hyperpod = var.create_hyperpod_module && !(var.create_eks_module && !var.create_helm_chart_module)
karpenter_role_arn = var.create_sagemaker_iam_role_module && length(module.sagemaker_iam_role[0].karpenter_role_arn) > 0 ? module.sagemaker_iam_role[0].karpenter_role_arn[0] : null
nat_gateway_id = var.create_vpc_module ? module.vpc[0].nat_gateway_1_id : var.existing_nat_gateway_id
private_route_table_id = var.create_private_subnet_module ? module.private_subnet[0].private_route_table_id : var.existing_private_route_table_id
eks_private_subnet_cidrs = [var.eks_private_subnet_1_cidr, var.eks_private_subnet_2_cidr]
instance_groups = !local.rig_mode ? var.instance_groups : {}
}

module "vpc" {
@@ -69,7 +70,7 @@ module "eks_cluster" {
}

module "s3_bucket" {
count = !local.rig_mode && var.create_s3_bucket_module ? 1 : 0
source = "./modules/s3_bucket"

resource_name_prefix = var.resource_name_prefix
@@ -95,7 +96,7 @@ module "vpc_endpoints" {
}

module "lifecycle_script" {
count = !local.rig_mode && var.create_lifecycle_script_module ? 1 : 0
source = "./modules/lifecycle_script"

resource_name_prefix = var.resource_name_prefix
@@ -139,6 +140,7 @@ module "helm_chart" {
enable_kubeflow_training_operators = var.enable_kubeflow_training_operators
enable_cluster_role_and_bindings = var.enable_cluster_role_and_bindings
enable_namespaced_role_and_bindings = var.enable_namespaced_role_and_bindings
enable_team_role_and_bindings = var.enable_team_role_and_bindings
enable_nvidia_device_plugin = var.enable_nvidia_device_plugin
enable_neuron_device_plugin = var.enable_neuron_device_plugin
enable_mpi_operator = var.enable_mpi_operator
@@ -166,7 +168,7 @@ module "hyperpod_cluster" {
resource_name_prefix = var.resource_name_prefix
hyperpod_cluster_name = var.hyperpod_cluster_name
auto_node_recovery = var.auto_node_recovery
instance_groups = local.instance_groups
restricted_instance_groups = var.restricted_instance_groups
private_subnet_id = local.private_subnet_id
security_group_id = local.security_group_id
@@ -92,6 +92,10 @@ resource "helm_release" "hyperpod" {
name = "namespaced-role-and-bindings.enable"
value = var.enable_namespaced_role_and_bindings
},
{
name = "team-role-and-bindings.enabled"
value = var.enable_team_role_and_bindings
},
{
name = "gpu-operator.enabled"
value = var.enable_gpu_operator
@@ -60,11 +60,17 @@ variable "enable_cluster_role_and_bindings" {
description = "Whether to enable the cluster role and bindings"
type = bool
}

variable "enable_namespaced_role_and_bindings" {
description = "Whether to enable the namespaced role and bindings"
type = bool
}

variable "enable_team_role_and_bindings" {
description = "Whether to enable the team role and bindings"
type = bool
}

variable "enable_nvidia_device_plugin" {
description = "Whether to enable the NVIDIA device plugin"
type = bool
@@ -87,7 +87,7 @@ resource "awscc_sagemaker_cluster" "hyperpod_cluster" {

restricted_instance_groups = length(local.restricted_instance_groups_list) > 0 ? local.restricted_instance_groups_list : null

node_provisioning_mode = !var.rig_mode && var.continuous_provisioning_mode ? "Continuous" : null

node_recovery = var.auto_node_recovery ? "Automatic" : null

@@ -86,7 +86,7 @@ resource "aws_iam_policy" "sagemaker_execution_policy" {

policy = jsonencode({
Version = "2012-10-17"
Statement = concat([
{
Effect = "Allow"
Action = [
@@ -132,7 +132,9 @@ resource "aws_iam_policy" "sagemaker_execution_policy" {
"eks-auth:AssumeRoleForPodIdentity"
]
Resource = local.eks_cluster_arn
}
],
!var.rig_mode ? [
{
Effect = "Allow"
Action = [
@@ -144,7 +146,7 @@
"arn:aws:s3:::${var.s3_bucket_name}/*"
]
}
] : [])
})

tags = var.tags
@@ -66,12 +66,12 @@ output "eks_cluster_certificate_authority" {
# S3 Bucket Outputs
output "s3_bucket_name" {
description = "Name of the S3 bucket"
value = !local.rig_mode ? (var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name) : null
}

output "s3_bucket_arn" {
description = "ARN of the S3 bucket"
value = !local.rig_mode ? (var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_arn : (var.existing_s3_bucket_name != "" ? data.aws_s3_bucket.existing_s3_bucket[0].arn : null)) : null
}

# S3 VPC Endpoints Outputs
@@ -6,7 +6,6 @@ aws_region = "us-east-1"
availability_zone_id = "use1-az6"
rig_input_s3_bucket = "my-tf-rig-test-input-bucket"
rig_output_s3_bucket = "my-tf-rig-test-output-bucket"
restricted_instance_groups = {
rig-1 = {
instance_type = "ml.p5.48xlarge",
@@ -244,7 +244,7 @@ variable "enable_gpu_operator" {
variable "enable_mlflow" {
description = "Whether to enable MLflow"
type = bool
default = false
}

variable "enable_kubeflow_training_operators" {
@@ -253,15 +253,23 @@
default = true
}

# task governance helm chart values
variable "enable_cluster_role_and_bindings" {
description = "Whether to enable the cluster role and bindings"
type = bool
default = false
}

variable "enable_namespaced_role_and_bindings" {
description = "Whether to enable the namespaced role and bindings"
type = bool
default = false
}

variable "enable_team_role_and_bindings" {
description = "Whether to enable the team role and bindings"
type = bool
default = false
}

variable "enable_nvidia_device_plugin" {
Expand Down