Description
I had a previous ticket #3257 open with code using this module, but it was not reproducible for the team, so I have stripped a lot of it down so it can be used easily without referencing other Terraform files. The issue arises with any 20.x version of the EKS module when you try to deploy Bottlerocket GPU nodes into an AWS managed node group.
If your request is for a new feature, please use the Feature request template.
- ✋ I have searched the open/closed issues and my issue is not listed.
⚠️ Note
Before you submit an issue, please perform the following first:
- Remove the local `.terraform` directory (ONLY if state is stored remotely, which hopefully is the best practice you are following!): `rm -rf .terraform/`
- Re-initialize the project root to pull down modules: `terraform init`
- Re-attempt your `terraform plan` or `apply` and check whether the issue still persists
Versions
- Module version [Required]: 20.33.1 (the issue reproduces on any 20.x; 19.21.0 works)
- Terraform version: `required_version = ">= 1.4.5"`
- Provider version(s): `aws version = ">= 5.3.0"`
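For reference, the constraints above correspond to a versions block along these lines (a minimal sketch; the exact file layout is my assumption, not part of the original report):

```hcl
terraform {
  # Constraints as listed in the Versions section above
  required_version = ">= 1.4.5"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.3.0"
    }
  }
}
```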
Reproduction Code [Required]
module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "20.33.1"
cluster_name = "generic-cluster"
cluster_version = "1.31"
cluster_addons = {
coredns = {
most_recent = true
}
kube-proxy = {
most_recent = true
}
vpc-cni = {
most_recent = true
}
}
cluster_endpoint_public_access = true
enable_cluster_creator_admin_permissions = true
vpc_id = "vpc-0ddd59d5afa9f25a1"
subnet_ids = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]
control_plane_subnet_ids = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]
eks_managed_node_group_defaults = {
instance_types = ["c7i.2xlarge", "g5.2xlarge"]
}
eks_managed_node_groups = {
bigbang_generic = {
ami_type = "BOTTLEROCKET_x86_64"
instance_types = ["c7i.2xlarge"]
min_size = 1
max_size = 10
desired_size = 1
bootstrap_extra_args = <<-EOT
[settings.host-containers.admin]
enabled = false
[settings.host-containers.control]
enabled = true
[settings.kernel]
lockdown = "integrity"
[settings.kubernetes]
cluster-name = "${module.eks.cluster_name}"
api-server = "${module.eks.cluster_endpoint}"
cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
EOT
block_device_mappings = {
xvda = {
device_name = "/dev/xvda"
ebs = {
volume_size = 2
volume_type = "gp3"
iops = 3000
throughput = 150
encrypted = true
delete_on_termination = true
}
}
xvdb = {
device_name = "/dev/xvdb"
ebs = {
volume_size = 500
volume_type = "gp3"
iops = 3000
throughput = 150
encrypted = true
delete_on_termination = true
}
}
}
metadata_options = {
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 2
instance_metadata_tags = "disabled"
}
create_iam_role = true
iam_role_name = "generic-eks-managed-node-group"
iam_role_use_name_prefix = false
iam_role_description = "EKS managed node group for generic role"
iam_role_tags = {
Purpose = "Protector of the kubelet"
}
iam_role_additional_policies = {
AmazonEC2ContainerRegistryReadOnly = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
AdditionalPolicy = aws_iam_policy.node_additional.arn
AmazonEc2FullAccess = "arn:aws:iam::aws:policy/AmazonEC2FullAccess"
CloudWatchLogsFullAccess = "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess"
SecretsManagerReadWrite = "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
}
},
bigbang_gpu = {
ami_type = "BOTTLEROCKET_x86_64_NVIDIA"
instance_types = ["g5.2xlarge"]
min_size = 1
max_size = 10
desired_size = 1
bootstrap_extra_args = <<-EOT
[settings.host-containers.admin]
enabled = false
[settings.host-containers.control]
enabled = true
[settings.kernel]
lockdown = "integrity"
[settings.kubernetes.node-labels]
"bottlerocket.aws/updater-interface-version" = "2.0.0"
[settings.kubernetes]
cluster-name = "${module.eks.cluster_name}"
api-server = "${module.eks.cluster_endpoint}"
cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
EOT
create_iam_role = false
iam_role_arn = "arn:aws:iam::XXXXXXXXXXX:role/generic-eks-managed-node-group"
}
}
tags = {
Environment = "dev"
Terraform = "true"
}
}
resource "aws_iam_policy" "node_additional" {
name = "generic-cluster-additional"
description = "Example usage of node additional policy"
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Sid = "EC2DescribeAccess",
Effect = "Allow",
Action = "ec2:Describe*",
Resource = "*"
},
{
Sid = "AllowSSMManagedInstanceCore",
Effect = "Allow",
Action = [
"ssm:DescribeAssociation",
"ssm:GetDeployablePatchSnapshotForInstance",
"ssm:GetDocument",
"ssm:DescribeDocument",
"ssm:GetManifest",
"ssm:GetParameter",
"ssm:GetParameters",
"ssm:ListAssociations",
"ssm:ListInstanceAssociations",
"ssm:PutInventory",
"ssm:PutComplianceItems",
"ssm:PutConfigurePackageResult",
"ssm:UpdateAssociationStatus",
"ssm:UpdateInstanceAssociationStatus",
"ssm:UpdateInstanceInformation"
],
Resource = "*"
},
{
Sid = "AllowSSMMessages",
Effect = "Allow",
Action = [
"ssmmessages:CreateControlChannel",
"ssmmessages:CreateDataChannel",
"ssmmessages:OpenControlChannel",
"ssmmessages:OpenDataChannel"
],
Resource = "*"
},
{
Sid = "AllowEC2Messages",
Effect = "Allow",
Action = [
"ec2messages:AcknowledgeMessage",
"ec2messages:DeleteMessage",
"ec2messages:FailMessage",
"ec2messages:GetEndpoint",
"ec2messages:GetMessages",
"ec2messages:SendReply"
],
Resource = "*"
}
]
})
}
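Note that the `bigbang_gpu` group reuses the IAM role created by `bigbang_generic` via a hard-coded ARN. For anyone reproducing this without embedding an account ID, the role can instead be resolved by name with a data source (a sketch under that assumption; this data source and its wiring are not part of the original report):

```hcl
# Hypothetical alternative to hard-coding the account ID in iam_role_arn:
# resolve the role created for the generic node group by name. The role
# must already exist when this data source is read.
data "aws_iam_role" "node_group" {
  name = "generic-eks-managed-node-group"
}

# Then, inside the bigbang_gpu node group definition:
#   create_iam_role = false
#   iam_role_arn    = data.aws_iam_role.node_group.arn
```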
Now here's the 19.21.0 version code:
module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "19.21.0"
cluster_name = "generic-cluster"
cluster_version = "1.31"
cluster_addons = {
coredns = {
most_recent = true
}
kube-proxy = {
most_recent = true
}
vpc-cni = {
most_recent = true
}
}
cluster_endpoint_public_access = true
vpc_id = "vpc-0ddd59d5afa9f25a1"
subnet_ids = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]
control_plane_subnet_ids = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]
eks_managed_node_group_defaults = {
instance_types = ["c7i.2xlarge", "g5.2xlarge"]
}
eks_managed_node_groups = {
bigbang_generic = {
ami_type = "BOTTLEROCKET_x86_64"
instance_types = ["c7i.2xlarge"]
min_size = 1
max_size = 10
desired_size = 1
bootstrap_extra_args = <<-EOT
[settings.host-containers.admin]
enabled = false
[settings.host-containers.control]
enabled = true
[settings.kernel]
lockdown = "integrity"
[settings.kubernetes]
cluster-name = "${module.eks.cluster_name}"
api-server = "${module.eks.cluster_endpoint}"
cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
EOT
block_device_mappings = {
xvda = {
device_name = "/dev/xvda"
ebs = {
volume_size = 2
volume_type = "gp3"
iops = 3000
throughput = 150
encrypted = true
delete_on_termination = true
}
}
xvdb = {
device_name = "/dev/xvdb"
ebs = {
volume_size = 500
volume_type = "gp3"
iops = 3000
throughput = 150
encrypted = true
delete_on_termination = true
}
}
}
metadata_options = {
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 2
instance_metadata_tags = "disabled"
}
create_iam_role = true
iam_role_name = "generic-eks-managed-node-group"
iam_role_use_name_prefix = false
iam_role_description = "EKS managed node group for generic role"
iam_role_tags = {
Purpose = "Protector of the kubelet"
}
iam_role_additional_policies = {
AmazonEC2ContainerRegistryReadOnly = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
AdditionalPolicy = aws_iam_policy.node_additional.arn
AmazonEc2FullAccess = "arn:aws:iam::aws:policy/AmazonEC2FullAccess"
CloudWatchLogsFullAccess = "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess"
SecretsManagerReadWrite = "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
}
},
bigbang_gpu = {
ami_type = "BOTTLEROCKET_x86_64_NVIDIA"
instance_types = ["g5.2xlarge"]
min_size = 1
max_size = 10
desired_size = 1
bootstrap_extra_args = <<-EOT
[settings.host-containers.admin]
enabled = false
[settings.host-containers.control]
enabled = true
[settings.kernel]
lockdown = "integrity"
[settings.kubernetes.node-labels]
"bottlerocket.aws/updater-interface-version" = "2.0.0"
[settings.kubernetes]
cluster-name = "${module.eks.cluster_name}"
api-server = "${module.eks.cluster_endpoint}"
cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
EOT
create_iam_role = false
iam_role_arn = "arn:aws:iam::971870020263:role/generic-eks-managed-node-group"
}
}
tags = {
Environment = "dev"
Terraform = "true"
}
}
resource "aws_iam_policy" "node_additional" {
name = "generic-cluster-additional"
description = "Example usage of node additional policy"
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Sid = "EC2DescribeAccess",
Effect = "Allow",
Action = "ec2:Describe*",
Resource = "*"
},
{
Sid = "AllowSSMManagedInstanceCore",
Effect = "Allow",
Action = [
"ssm:DescribeAssociation",
"ssm:GetDeployablePatchSnapshotForInstance",
"ssm:GetDocument",
"ssm:DescribeDocument",
"ssm:GetManifest",
"ssm:GetParameter",
"ssm:GetParameters",
"ssm:ListAssociations",
"ssm:ListInstanceAssociations",
"ssm:PutInventory",
"ssm:PutComplianceItems",
"ssm:PutConfigurePackageResult",
"ssm:UpdateAssociationStatus",
"ssm:UpdateInstanceAssociationStatus",
"ssm:UpdateInstanceInformation"
],
Resource = "*"
},
{
Sid = "AllowSSMMessages",
Effect = "Allow",
Action = [
"ssmmessages:CreateControlChannel",
"ssmmessages:CreateDataChannel",
"ssmmessages:OpenControlChannel",
"ssmmessages:OpenDataChannel"
],
Resource = "*"
},
{
Sid = "AllowEC2Messages",
Effect = "Allow",
Action = [
"ec2messages:AcknowledgeMessage",
"ec2messages:DeleteMessage",
"ec2messages:FailMessage",
"ec2messages:GetEndpoint",
"ec2messages:GetMessages",
"ec2messages:SendReply"
],
Resource = "*"
}
]
})
}
Steps to reproduce the behavior:
I am running this code via Visual Studio, connected to my AWS account.
Module version 19.21.0 works fine for both Bottlerocket CPU and GPU nodes with this exact same code, with only the version changed (see the sketch below).
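As a sketch of the version flip (the comments are mine; note that `enable_cluster_creator_admin_permissions` only exists in v20, so it is removed when pinning back to v19):

```hcl
module "eks" {
  source = "terraform-aws-modules/eks/aws"

  # Failing: the Bottlerocket GPU node group never joins the cluster
  version = "20.33.1"

  # Working: both node groups join
  # version = "19.21.0"

  # ... remainder of the configuration above, unchanged ...
}
```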
Expected behavior
Both AWS managed node groups are created successfully and their nodes join the EKS cluster.
Actual behavior
When using EKS module version 20.33.1, the Bottlerocket CPU nodes join the EKS cluster without issue, but the GPU nodes do not.
Terminal Output Screenshot(s)
When using version 19.21.0