From 168cba6c9c60d551b481602c22c2f9330ea91001 Mon Sep 17 00:00:00 2001 From: bluecrayon52 <16687465+bluecrayon52@users.noreply.github.com> Date: Sun, 23 Nov 2025 13:59:41 -0600 Subject: [PATCH 1/2] auto-disabled igs and lcs for rig mode for better UX --- .../terraform-modules/README.md | 21 ++++++++++--------- .../terraform-modules/hyperpod-eks-tf/main.tf | 13 +++++++----- .../modules/helm_chart/main.tf | 4 ++++ .../modules/helm_chart/variables.tf | 6 ++++++ .../modules/hyperpod_cluster/main.tf | 2 +- .../modules/sagemaker_iam_role/main.tf | 8 ++++--- .../hyperpod-eks-tf/rig_custom.tfvars | 1 - .../hyperpod-eks-tf/variables.tf | 14 ++++++++++--- 8 files changed, 46 insertions(+), 23 deletions(-) diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md index 871aa5546..e513cc26f 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/README.md @@ -84,13 +84,13 @@ As a prerequisite, you will need to identify or create input and output S3 bucke To create new S3 buckets, you can execute commands like the following example using the AWS CLI: ```bash -aws s3 mb s3://my-tf-rig-test-input-bucket --region us-east-1 +aws s3 mb s3://my-tf-rig-test-input-bucket --region us-east-1 # adjust region as needed -aws s3 mb s3://my-tf-rig-test-output-bucket --region us-east-1 +aws s3 mb s3://my-tf-rig-test-output-bucket --region us-east-1 # adjust region as needed ``` S3 bucket names must be globally unique. -You will also need to have [yq](https://pypi.org/project/yq/) installed so that a bash script that modifies CoreDNS and VPC NCI deployments can execute properly. +You will also need to have [yq](https://pypi.org/project/yq/) installed so that a bash script that modifies CoreDNS and VPC CNI deployments can execute properly. For Nova model customization using Restricted Instance Groups (RIG), you can use the example configuration in [`rig_custom.tfvars`](./hyperpod-eks-tf/rig_custom.tfvars). This file demonstrates how to configure restricted instance groups with the necessary S3 buckets and instance specifications. @@ -106,7 +106,6 @@ aws_region = "us-east-1" availability_zone_id = "use1-az6" rig_input_s3_bucket = "my-tf-rig-test-input-bucket" rig_output_s3_bucket = "my-tf-rig-test-output-bucket" -instance_groups = {} restricted_instance_groups = { rig-1 = { instance_type = "ml.p5.48xlarge", @@ -122,10 +121,14 @@ restricted_instance_groups = { EOL ``` RIG mode (`local.rig_mode = true` set in [main.tf](./hyperpod-eks-tf/main.tf)) is automatic when `restricted_instance_groups` are defined, enabling Nova model customization with the following changes: -- VPC Endpoints: Lambda and SQS interface endpoints are added for reinforcement fine-tuning (RFT) with integrations for your custom reward service hosted outside of the RIG. These endpoints are enabled in RIG mode by default so that you can easily transition from continuous pre-training (CPT) or supervised fine-tuning (SFT) to RFT without making infrastructure changes, but they can be disabled by setting `rig_rft_lambda_access` and `rig_rft_sqs_access` to false. -- IAM Execution Role Permissions: The execution role associated wit the HyperPod nodes is expanded to include read permission to your input S3 bucket and write permissions to your output S3 bucket. 
Access to SQS and Lambda resources with ARN patterns `arn:aws:lambda:*:*:function:*SageMaker*` and `arn:aws:sqs:*:*:*SageMaker*` are also conditionally added if `rig_rft_lambda_access` and `rig_rft_sqs_access` are true (default). -- Helm Charts: A specific Helm revision is checked out and used for RIG support. After Helm chart instillation, a bash script is used to modify CoreDNS and VPC NCI deployments (be sure to have [yq](https://pypi.org/project/yq/) installed for this). -- HyperPod Cluster: Continuous provisioning mode and Karpenter autoscaling are disabled automatically. +- **VPC Endpoints**: Lambda and SQS interface endpoints are added for reinforcement fine-tuning (RFT) with integrations for your custom reward service hosted outside of the RIG. These endpoints are enabled in RIG mode by default so that you can easily transition from continuous pre-training (CPT) or supervised fine-tuning (SFT) to RFT without making infrastructure changes, but they can be disabled by setting `rig_rft_lambda_access` and `rig_rft_sqs_access` to false. +- **IAM Execution Role Permissions**: The execution role associated with the HyperPod nodes is expanded to include read permission to your input S3 bucket and write permissions to your output S3 bucket. Access to SQS and Lambda resources with ARN patterns `arn:aws:lambda:*:*:function:*SageMaker*` and `arn:aws:sqs:*:*:*SageMaker*` are also conditionally added if `rig_rft_lambda_access` and `rig_rft_sqs_access` are true (default). +- **Helm Charts**: A specific Helm revision is checked out and used for RIG support. After Helm chart installation, a bash script is used to modify CoreDNS and VPC CNI deployments (be sure to have [yq](https://pypi.org/project/yq/) installed for this). +- **HyperPod Cluster**: Continuous provisioning mode and Karpenter autoscaling are disabled automatically for RIG compatibility. Deploying a HyperPod cluster with a combination of standard instance groups and RIGs is also not currently supported, so `instance_groups` definitions are ignored when `restricted_instance_groups` are defined. +- **FSx for Lustre**: For RIGs, a service-managed FSx for Lustre file system is created based on the specifications you provide in `fsxl_per_unit_storage_throughput` and `fsxl_size_in_gi_b`. + - Valid values for `fsxl_per_unit_storage_throughput` are 125, 250, 500, or 1000 MBps/TiB. + - Valid values for `fsxl_size_in_gi_b` start at 1200 GiB and go up in increments of 2400 GiB. +- **S3 Lifecycle Scripts**: Because RIGs do not leverage lifecycle scripts, the `s3_bucket` and `lifecycle_script` modules are also disabled in RIG mode. Please note that the following addons are NOT currently supported on HyperPod with RIGs: - HyperPod Task Governance @@ -135,8 +138,6 @@ Please note that the following addons are NOT currently supported on HyperPod wi Do not attempt to install these addons later using the console. -Deploying a HyperPod cluster with a combination of standard instance groups and RIGs is also not currently supported, so be sure to specify `instance_groups = {}` in your configuration. - Once you have your `rig_custom.tfvars` file is created, you can proceed to deployment.
--- diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf index f210c15e6..7cff18f19 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf @@ -9,18 +9,20 @@ data "aws_s3_bucket" "existing_s3_bucket" { } locals { + rig_mode = length(var.restricted_instance_groups) > 0 vpc_id = var.create_vpc_module ? module.vpc[0].vpc_id : var.existing_vpc_id private_subnet_id = var.create_private_subnet_module ? module.private_subnet[0].private_subnet_id : var.existing_private_subnet_id security_group_id = var.create_security_group_module ? module.security_group[0].security_group_id : var.existing_security_group_id - s3_bucket_name = var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name + which_s3_bucket_name = var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name + s3_bucket_name = !local.rig_mode ? local.which_s3_bucket_name : null eks_cluster_name = var.create_eks_module ? module.eks_cluster[0].eks_cluster_name : var.existing_eks_cluster_name sagemaker_iam_role_name = var.create_sagemaker_iam_role_module ? module.sagemaker_iam_role[0].sagemaker_iam_role_name : var.existing_sagemaker_iam_role_name deploy_hyperpod = var.create_hyperpod_module && !(var.create_eks_module && !var.create_helm_chart_module) - rig_mode = length(var.restricted_instance_groups) > 0 karpenter_role_arn = var.create_sagemaker_iam_role_module && length(module.sagemaker_iam_role[0].karpenter_role_arn) > 0 ? module.sagemaker_iam_role[0].karpenter_role_arn[0] : null nat_gateway_id = var.create_vpc_module ? module.vpc[0].nat_gateway_1_id : var.existing_nat_gateway_id private_route_table_id = var.create_private_subnet_module ? module.private_subnet[0].private_route_table_id : var.existing_private_route_table_id eks_private_subnet_cidrs = [var.eks_private_subnet_1_cidr, var.eks_private_subnet_2_cidr] + instance_groups = !local.rig_mode ? var.instance_groups : {} } module "vpc" { @@ -69,7 +71,7 @@ module "eks_cluster" { } module "s3_bucket" { - count = var.create_s3_bucket_module ? 1 : 0 + count = !local.rig_mode && var.create_s3_bucket_module ? 1 : 0 source = "./modules/s3_bucket" resource_name_prefix = var.resource_name_prefix @@ -95,7 +97,7 @@ module "vpc_endpoints" { } module "lifecycle_script" { - count = var.create_lifecycle_script_module ? 1 : 0 + count = !local.rig_mode && var.create_lifecycle_script_module ? 
1 : 0 source = "./modules/lifecycle_script" resource_name_prefix = var.resource_name_prefix @@ -139,6 +141,7 @@ module "helm_chart" { enable_kubeflow_training_operators = var.enable_kubeflow_training_operators enable_cluster_role_and_bindings = var.enable_cluster_role_and_bindings enable_namespaced_role_and_bindings = var.enable_namespaced_role_and_bindings + enable_team_role_and_bindings = var.enable_team_role_and_bindings enable_nvidia_device_plugin = var.enable_nvidia_device_plugin enable_neuron_device_plugin = var.enable_neuron_device_plugin enable_mpi_operator = var.enable_mpi_operator @@ -166,7 +169,7 @@ module "hyperpod_cluster" { resource_name_prefix = var.resource_name_prefix hyperpod_cluster_name = var.hyperpod_cluster_name auto_node_recovery = var.auto_node_recovery - instance_groups = var.instance_groups + instance_groups = local.instance_groups restricted_instance_groups = var.restricted_instance_groups private_subnet_id = local.private_subnet_id security_group_id = local.security_group_id diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf index 27c582411..e2a4459c9 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/main.tf @@ -92,6 +92,10 @@ resource "helm_release" "hyperpod" { name = "namespaced-role-and-bindings.enable" value = var.enable_namespaced_role_and_bindings }, + { + name = "team-role-and-bindings.enabled" + value = var.enable_team_role_and_bindings + }, { name = "gpu-operator.enabled" value = var.enable_gpu_operator diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf index 4ef7d9a94..0c9aa13c9 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/helm_chart/variables.tf @@ -60,11 +60,17 @@ variable "enable_cluster_role_and_bindings" { description = "Whether to enable the cluster role and bindings" type = bool } + variable "enable_namespaced_role_and_bindings" { description = "Whether to enable the namespaced role and bindings" type = bool } +variable "enable_team_role_and_bindings" { + description = "Whether to enable the team role and binding" + type = bool +} + variable "enable_nvidia_device_plugin" { description = "Whether to enable the NVIDIA device plugin" type = bool diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf index 24b6cbae2..07699d2a3 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/hyperpod_cluster/main.tf @@ -87,7 +87,7 @@ resource "awscc_sagemaker_cluster" "hyperpod_cluster" { restricted_instance_groups = length(local.restricted_instance_groups_list) > 0 ? local.restricted_instance_groups_list : null - node_provisioning_mode = var.rig_mode ? 
null : var.continuous_provisioning_mode ? "Continuous" : null + node_provisioning_mode = !var.rig_mode && var.continuous_provisioning_mode ? "Continuous" : null node_recovery = var.auto_node_recovery ? "Automatic" : null diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf index 789595f35..f5bccf5ca 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/modules/sagemaker_iam_role/main.tf @@ -86,7 +86,7 @@ resource "aws_iam_policy" "sagemaker_execution_policy" { policy = jsonencode({ Version = "2012-10-17" - Statement = [ + Statement = concat([ { Effect = "Allow" Action = [ @@ -132,7 +132,9 @@ resource "aws_iam_policy" "sagemaker_execution_policy" { "eks-auth:AssumeRoleForPodIdentity" ] Resource = local.eks_cluster_arn - }, + } + ], + !var.rig_mode ? [ { Effect = "Allow" Action = [ @@ -144,7 +146,7 @@ resource "aws_iam_policy" "sagemaker_execution_policy" { "arn:aws:s3:::${var.s3_bucket_name}/*" ] } - ] + ] : []) }) tags = var.tags diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars index 63ab67d95..91009e363 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/rig_custom.tfvars @@ -6,7 +6,6 @@ aws_region = "us-east-1" availability_zone_id = "use1-az6" rig_input_s3_bucket = "my-tf-rig-test-input-bucket" rig_output_s3_bucket = "my-tf-rig-test-output-bucket" -instance_groups = {} restricted_instance_groups = { rig-1 = { instance_type = "ml.p5.48xlarge", diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf index 37e053ffe..78dc776e0 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/variables.tf @@ -244,7 +244,7 @@ variable "enable_gpu_operator" { variable "enable_mlflow" { description = "Whether to enable the MLFlow" type = bool - default = true + default = false } variable "enable_kubeflow_training_operators" { @@ -253,15 +253,23 @@ variable "enable_kubeflow_training_operators" { default = true } +# task governance helm chart values variable "enable_cluster_role_and_bindings" { description = "Whether to enable the cluster role and bindings" type = bool - default = true + default = false } + variable "enable_namespaced_role_and_bindings" { description = "Whether to enable the namespaced role and bindings" type = bool - default = true + default = false +} + +variable "enable_team_role_and_bindings" { + description = "Whether to enable the team role and binding" + type = bool + default = false } variable "enable_nvidia_device_plugin" { From d804b3b263999242645fcff70e704c61562652cf Mon Sep 17 00:00:00 2001 From: bluecrayon52 <16687465+bluecrayon52@users.noreply.github.com> Date: Sun, 23 Nov 2025 17:04:40 -0600 Subject: [PATCH 2/2] simplified logic for disabling s3_bucket module, added conditional outputs for s3_bucket module --- 
.../terraform-modules/hyperpod-eks-tf/main.tf | 3 +-- .../terraform-modules/hyperpod-eks-tf/outputs.tf | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf index 7cff18f19..0241b7941 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/main.tf @@ -13,8 +13,7 @@ locals { vpc_id = var.create_vpc_module ? module.vpc[0].vpc_id : var.existing_vpc_id private_subnet_id = var.create_private_subnet_module ? module.private_subnet[0].private_subnet_id : var.existing_private_subnet_id security_group_id = var.create_security_group_module ? module.security_group[0].security_group_id : var.existing_security_group_id - which_s3_bucket_name = var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name - s3_bucket_name = !local.rig_mode ? local.which_s3_bucket_name : null + s3_bucket_name = !local.rig_mode ? (var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name) : null eks_cluster_name = var.create_eks_module ? module.eks_cluster[0].eks_cluster_name : var.existing_eks_cluster_name sagemaker_iam_role_name = var.create_sagemaker_iam_role_module ? module.sagemaker_iam_role[0].sagemaker_iam_role_name : var.existing_sagemaker_iam_role_name deploy_hyperpod = var.create_hyperpod_module && !(var.create_eks_module && !var.create_helm_chart_module) diff --git a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf index d7e558a4d..e97235a0b 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf +++ b/1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf/outputs.tf @@ -66,12 +66,12 @@ output "eks_cluster_certificate_authority" { # S3 Bucket Outputs output "s3_bucket_name" { description = "Name of the S3 bucket" - value = var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name + value = !local.rig_mode ? (var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_name : var.existing_s3_bucket_name) : null } output "s3_bucket_arn" { description = "ARN of the S3 bucket" - value = var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_arn : (var.existing_s3_bucket_name != "" ? data.aws_s3_bucket.existing_s3_bucket[0].arn : null) + value = !local.rig_mode ? (var.create_s3_bucket_module ? module.s3_bucket[0].s3_bucket_arn : (var.existing_s3_bucket_name != "" ? data.aws_s3_bucket.existing_s3_bucket[0].arn : null)) : null } # S3 VPC Endpoints Outputs
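With the patch applied, a minimal deployment sketch for the RIG configuration could look like the following, assuming commands are run from the `hyperpod-eks-tf` directory and that the `rig_custom.tfvars` example from the README above is used (bucket names, region, and instance counts are illustrative):

```bash
# Illustrative only: standard Terraform workflow using the example RIG tfvars file.
cd 1.architectures/7.sagemaker-hyperpod-eks/terraform-modules/hyperpod-eks-tf

# Initialize providers and modules.
terraform init

# Review the plan; defining restricted_instance_groups in rig_custom.tfvars switches on
# RIG mode, so the s3_bucket and lifecycle_script modules should no longer be planned.
terraform plan -var-file=rig_custom.tfvars

# Apply the configuration to create the EKS and HyperPod resources.
terraform apply -var-file=rig_custom.tfvars
```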