Bottlerocket GPU nodes do not join the cluster using Managed node groups in version 20.X #3287

@PMBerrigan

Description

I had a previous issue #3257 open whose reproduction code referenced other Terraform files and could not be reproduced by the team, so I have stripped it down so it can be run on its own. The issue arises with any 20.X version of the EKS module when deploying Bottlerocket GPU nodes into a managed node group in AWS.

  • ✋ I have searched the open/closed issues and my issue is not listed.

Versions

  • Module version [Required]: 20.33.1 (issue present; 19.21.0 works)

  • Terraform version:
    required_version = ">= 1.4.5"

  • Provider version(s):
    aws version = ">= 5.3.0"
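
For completeness, a minimal terraform/provider block matching the versions pinned above, to make the reproduction self-contained. This is a sketch; the region is an assumption (the issue does not state one) and only needs to offer g5 instances:

terraform {
  required_version = ">= 1.4.5"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.3.0"
    }
  }
}

# Assumption: the issue does not state a region; any region with g5 capacity works.
provider "aws" {
  region = "us-east-1"
}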

Reproduction Code [Required]

module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "20.33.1"

  cluster_name    = "generic-cluster"
  cluster_version = "1.31"

  cluster_addons = {
    coredns = {
      most_recent = true
    }
    kube-proxy = {
      most_recent = true
    }
    vpc-cni = {
      most_recent = true
    }
  }

  cluster_endpoint_public_access               = true
  enable_cluster_creator_admin_permissions     = true
  vpc_id                                       = "vpc-0ddd59d5afa9f25a1"
  subnet_ids                                   = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]
  control_plane_subnet_ids                     = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]

  eks_managed_node_group_defaults = {
    instance_types = ["c7i.2xlarge", "g5.2xlarge"]
  }

  eks_managed_node_groups = {
    bigbang_generic = {
      ami_type       = "BOTTLEROCKET_x86_64"
      instance_types = ["c7i.2xlarge"]

      min_size             = 1
      max_size             = 10
      desired_size         = 1
      bootstrap_extra_args = <<-EOT
          [settings.host-containers.admin]
          enabled = false
          [settings.host-containers.control]
          enabled = true
          [settings.kernel]
          lockdown = "integrity"
          [settings.kubernetes]
          cluster-name = "${module.eks.cluster_name}"
          api-server = "${module.eks.cluster_endpoint}"
          cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
        EOT

      block_device_mappings = {
        xvda = {
          device_name = "/dev/xvda"
          ebs = {
            volume_size           = 2
            volume_type           = "gp3"
            iops                  = 3000
            throughput            = 150
            encrypted             = true
            delete_on_termination = true
          }
        }
        xvdb = {
          device_name = "/dev/xvdb"
          ebs = {
            volume_size           = 500
            volume_type           = "gp3"
            iops                  = 3000
            throughput            = 150
            encrypted             = true
            delete_on_termination = true
          }
        }
      }

      metadata_options = {
        http_endpoint               = "enabled"
        http_tokens                 = "required"
        http_put_response_hop_limit = 2
        instance_metadata_tags      = "disabled"
      }

      create_iam_role = true
      iam_role_name   = "generic-eks-managed-node-group"
      iam_role_use_name_prefix = false
      iam_role_description     = "EKS managed node group for generic role"
      iam_role_tags = {
        Purpose = "Protector of the kubelet"
      }

      iam_role_additional_policies = {
        AmazonEC2ContainerRegistryReadOnly = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
        AdditionalPolicy                   = aws_iam_policy.node_additional.arn
        AmazonEc2FullAccess                = "arn:aws:iam::aws:policy/AmazonEC2FullAccess"
        CloudWatchLogsFullAccess           = "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess"
        SecretsManagerReadWrite            = "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
      }
    },

    bigbang_gpu = {
      ami_type       = "BOTTLEROCKET_x86_64_NVIDIA"
      instance_types = ["g5.2xlarge"]

      min_size             = 1
      max_size             = 10
      desired_size         = 1
      bootstrap_extra_args = <<-EOT
          [settings.host-containers.admin]
          enabled = false
          [settings.host-containers.control]
          enabled = true
          [settings.kernel]
          lockdown = "integrity"
          [settings.kubernetes.node-labels]
          "bottlerocket.aws/updater-interface-version" = "2.0.0"
          [settings.kubernetes]
          cluster-name = "${module.eks.cluster_name}"
          api-server = "${module.eks.cluster_endpoint}"
          cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
        EOT

      create_iam_role = false
      iam_role_arn    = "arn:aws:iam::XXXXXXXXXXX:role/generic-eks-managed-node-group"
    }
  }

  tags = {
    Environment = "dev"
    Terraform   = "true"
  }
}

resource "aws_iam_policy" "node_additional" {
  name        = "generic-cluster-additional"
  description = "Example usage of node additional policy"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid      = "EC2DescribeAccess",
        Effect   = "Allow",
        Action   = "ec2:Describe*",
        Resource = "*"
      },
      {
        Sid    = "AllowSSMManagedInstanceCore",
        Effect = "Allow",
        Action = [
          "ssm:DescribeAssociation",
          "ssm:GetDeployablePatchSnapshotForInstance",
          "ssm:GetDocument",
          "ssm:DescribeDocument",
          "ssm:GetManifest",
          "ssm:GetParameter",
          "ssm:GetParameters",
          "ssm:ListAssociations",
          "ssm:ListInstanceAssociations",
          "ssm:PutInventory",
          "ssm:PutComplianceItems",
          "ssm:PutConfigurePackageResult",
          "ssm:UpdateAssociationStatus",
          "ssm:UpdateInstanceAssociationStatus",
          "ssm:UpdateInstanceInformation"
        ],
        Resource = "*"
      },
      {
        Sid    = "AllowSSMMessages",
        Effect = "Allow",
        Action = [
          "ssmmessages:CreateControlChannel",
          "ssmmessages:CreateDataChannel",
          "ssmmessages:OpenControlChannel",
          "ssmmessages:OpenDataChannel"
        ],
        Resource = "*"
      },
      {
        Sid    = "AllowEC2Messages",
        Effect = "Allow",
        Action = [
          "ec2messages:AcknowledgeMessage",
          "ec2messages:DeleteMessage",
          "ec2messages:FailMessage",
          "ec2messages:GetEndpoint",
          "ec2messages:GetMessages",
          "ec2messages:SendReply"
        ],
        Resource = "*"
      }
    ]
  })
}
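
To see what differs between the healthy CPU group and the failing GPU group, the module's node group attributes can be dumped after apply and diffed between module versions. A minimal sketch; the output names are arbitrary, and eks_managed_node_groups is an existing output of the module:

# Debugging aid: expose both node groups' attributes (launch template ID/version,
# status, etc.) so they can be compared across 19.21.0 and 20.33.1.
output "cpu_node_group" {
  value = module.eks.eks_managed_node_groups["bigbang_generic"]
}

output "gpu_node_group" {
  value = module.eks.eks_managed_node_groups["bigbang_gpu"]
}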

Now here's the 19.21.0 version of the code. It is identical apart from the module version and the omission of enable_cluster_creator_admin_permissions, which does not exist in 19.x:

module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "19.21.0"

  cluster_name    = "generic-cluster"
  cluster_version = "1.31"

  cluster_addons = {
    coredns = {
      most_recent = true
    }
    kube-proxy = {
      most_recent = true
    }
    vpc-cni = {
      most_recent = true
    }
  }

  cluster_endpoint_public_access               = true
  vpc_id                                       = "vpc-0ddd59d5afa9f25a1"
  subnet_ids                                   = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]
  control_plane_subnet_ids                     = ["subnet-043a23d8ef5e4c894", "subnet-0796b40c721f92045", "subnet-0cb5cf126b0e694b2"]

  eks_managed_node_group_defaults = {
    instance_types = ["c7i.2xlarge", "g5.2xlarge"]
  }

  eks_managed_node_groups = {
    bigbang_generic = {
      ami_type       = "BOTTLEROCKET_x86_64"
      instance_types = ["c7i.2xlarge"]

      min_size             = 1
      max_size             = 10
      desired_size         = 1
      bootstrap_extra_args = <<-EOT
          [settings.host-containers.admin]
          enabled = false
          [settings.host-containers.control]
          enabled = true
          [settings.kernel]
          lockdown = "integrity"
          [settings.kubernetes]
          cluster-name = "${module.eks.cluster_name}"
          api-server = "${module.eks.cluster_endpoint}"
          cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
        EOT

      block_device_mappings = {
        xvda = {
          device_name = "/dev/xvda"
          ebs = {
            volume_size           = 2
            volume_type           = "gp3"
            iops                  = 3000
            throughput            = 150
            encrypted             = true
            delete_on_termination = true
          }
        }
        xvdb = {
          device_name = "/dev/xvdb"
          ebs = {
            volume_size           = 500
            volume_type           = "gp3"
            iops                  = 3000
            throughput            = 150
            encrypted             = true
            delete_on_termination = true
          }
        }
      }

      metadata_options = {
        http_endpoint               = "enabled"
        http_tokens                 = "required"
        http_put_response_hop_limit = 2
        instance_metadata_tags      = "disabled"
      }

      create_iam_role = true
      iam_role_name   = "generic-eks-managed-node-group"
      iam_role_use_name_prefix = false
      iam_role_description     = "EKS managed node group for generic role"
      iam_role_tags = {
        Purpose = "Protector of the kubelet"
      }

      iam_role_additional_policies = {
        AmazonEC2ContainerRegistryReadOnly = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
        AdditionalPolicy                   = aws_iam_policy.node_additional.arn
        AmazonEc2FullAccess                = "arn:aws:iam::aws:policy/AmazonEC2FullAccess"
        CloudWatchLogsFullAccess           = "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess"
        SecretsManagerReadWrite            = "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
      }
    },

    bigbang_gpu = {
      ami_type       = "BOTTLEROCKET_x86_64_NVIDIA"
      instance_types = ["g5.2xlarge"]

      min_size             = 1
      max_size             = 10
      desired_size         = 1
      bootstrap_extra_args = <<-EOT
          [settings.host-containers.admin]
          enabled = false
          [settings.host-containers.control]
          enabled = true
          [settings.kernel]
          lockdown = "integrity"
          [settings.kubernetes.node-labels]
          "bottlerocket.aws/updater-interface-version" = "2.0.0"
          [settings.kubernetes]
          cluster-name = "${module.eks.cluster_name}"
          api-server = "${module.eks.cluster_endpoint}"
          cluster-certificate = "${module.eks.cluster_certificate_authority_data}"
        EOT

      create_iam_role = false
      iam_role_arn    = "arn:aws:iam::XXXXXXXXXXX:role/generic-eks-managed-node-group"
    }
  }

  tags = {
    Environment = "dev"
    Terraform   = "true"
  }
}

resource "aws_iam_policy" "node_additional" {
  name        = "generic-cluster-additional"
  description = "Example usage of node additional policy"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid      = "EC2DescribeAccess",
        Effect   = "Allow",
        Action   = "ec2:Describe*",
        Resource = "*"
      },
      {
        Sid    = "AllowSSMManagedInstanceCore",
        Effect = "Allow",
        Action = [
          "ssm:DescribeAssociation",
          "ssm:GetDeployablePatchSnapshotForInstance",
          "ssm:GetDocument",
          "ssm:DescribeDocument",
          "ssm:GetManifest",
          "ssm:GetParameter",
          "ssm:GetParameters",
          "ssm:ListAssociations",
          "ssm:ListInstanceAssociations",
          "ssm:PutInventory",
          "ssm:PutComplianceItems",
          "ssm:PutConfigurePackageResult",
          "ssm:UpdateAssociationStatus",
          "ssm:UpdateInstanceAssociationStatus",
          "ssm:UpdateInstanceInformation"
        ],
        Resource = "*"
      },
      {
        Sid    = "AllowSSMMessages",
        Effect = "Allow",
        Action = [
          "ssmmessages:CreateControlChannel",
          "ssmmessages:CreateDataChannel",
          "ssmmessages:OpenControlChannel",
          "ssmmessages:OpenDataChannel"
        ],
        Resource = "*"
      },
      {
        Sid    = "AllowEC2Messages",
        Effect = "Allow",
        Action = [
          "ec2messages:AcknowledgeMessage",
          "ec2messages:DeleteMessage",
          "ec2messages:FailMessage",
          "ec2messages:GetEndpoint",
          "ec2messages:GetMessages",
          "ec2messages:SendReply"
        ],
        Resource = "*"
      }
    ]
  })
}

Steps to reproduce the behavior:

I am running this code from Visual Studio, connected to my AWS account.

Module version 19.21.0 works fine for both the Bottlerocket CPU and GPU nodes with this exact same code; only the module version is changed (re-running terraform init -upgrade to pick up the new module version).

Expected behavior

The expected behavior is that both AWS managed node groups are created successfully and the nodes in each join the EKS cluster.

Actual behavior

When using EKS module version 20.33.1, the Bottlerocket CPU nodes join the EKS cluster without issue, but the GPU nodes do not.

Terminal Output Screenshot(s)

When using version 20.33.1:
[screenshot]

When using version 19.21.0:
[screenshot]
