From 9c32b479b7ab4545c1e06d1e9acc51ed0e926246 Mon Sep 17 00:00:00 2001 From: elijah quinones Date: Wed, 14 Aug 2024 19:01:14 +0000 Subject: [PATCH] Fix accelerators not being considered when counting allocatables --- hack/generate-gpu-count-table.sh | 3 +- hack/generate-instance-store-table.sh | 3 +- pkg/cloud/volume_limits.go | 43 ++++++--- pkg/driver/node.go | 12 ++- pkg/driver/node_test.go | 121 ++++++++++++++++++++++---- 5 files changed, 147 insertions(+), 35 deletions(-) diff --git a/hack/generate-gpu-count-table.sh b/hack/generate-gpu-count-table.sh index 00ae0e4c03..ada3762131 100755 --- a/hack/generate-gpu-count-table.sh +++ b/hack/generate-gpu-count-table.sh @@ -15,6 +15,7 @@ # Generates gpu table for `pkg/cloud/volume_limits.go` from the AWS API # Ensure you are opted into all opt-in regions before running # Ensure your account isn't in any private instance type betas before running +# We are exluding the g5.48xlarge instance type as it is a special case that does not comply to regular ebs volume limit calculations set -euo pipefail @@ -34,4 +35,4 @@ function get_all_gpus() { done } -get_all_gpus | sort | uniq +get_all_gpus | sort | uniq | grep -v "g5.48xlarge" diff --git a/hack/generate-instance-store-table.sh b/hack/generate-instance-store-table.sh index f94e3cba37..471085e7e5 100755 --- a/hack/generate-instance-store-table.sh +++ b/hack/generate-instance-store-table.sh @@ -17,6 +17,7 @@ # Generates instance store table for `pkg/cloud/volume_limits.go` from the AWS API # Ensure you are opted into all opt-in regions before running # Ensure your account isn't in any private instance type betas before running +# We are exluding the g5.48xlarge instance type as it is a special case that does not comply to regular ebs volume limit calculations set -euo pipefail @@ -36,4 +37,4 @@ function get_all_instance_stores() { done } -get_all_instance_stores | sort | uniq +get_all_instance_stores | sort | uniq | grep -v "g5.48xlarge" diff --git a/pkg/cloud/volume_limits.go b/pkg/cloud/volume_limits.go index b925e5bdf7..9b9b393ddd 100644 --- a/pkg/cloud/volume_limits.go +++ b/pkg/cloud/volume_limits.go @@ -19,7 +19,7 @@ import ( "strings" ) -// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances +// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances const ( highMemoryMetalInstancesMaxVolumes = 19 highMemoryVirtualInstancesMaxVolumes = 27 @@ -51,7 +51,7 @@ func init() { var dedicatedVolumeLimits = map[string]int{} -// / List of nitro instance types can be found here: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances +// List of nitro instance types can be found here: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances var nonNitroInstanceFamilies = map[string]struct{}{ "t2": {}, "c3": {}, @@ -67,6 +67,7 @@ var nonNitroInstanceFamilies = map[string]struct{}{ "g3": {}, "d2": {}, "h1": {}, + "f1": {}, } func IsNitroInstanceType(it string) bool { @@ -88,8 +89,8 @@ func GetMaxAttachments(nitro bool) int { return nonNitroMaxAttachments } -// / Some instance types have a maximum limit of EBS volumes -// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html +// Some instance types have a maximum limit of EBS volumes +// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html var maxVolumeLimits = map[string]int{ "d3.8xlarge": 3, "d3en.12xlarge": 3, @@ -149,12 +150,16 @@ func GetReservedSlotsForInstanceType(it string) int { if ok { total += gpus } + acceleratorSlots, ok := acceleratorSlotsTaken[it] + if ok { + total += acceleratorSlots + } return total } -// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html -// / IMDS does not provide NVMe instance store data; we'll just list all instances here -// / TODO: See if we can get these values from DescribeInstanceTypes API +// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html +// IMDS does not provide NVMe instance store data; we'll just list all instances here +// g5.48xlarge is not added to this table as it is in the maxVolumeLimits var nvmeInstanceStoreVolumes = map[string]int{ "c1.medium": 1, "c1.xlarge": 4, @@ -242,7 +247,6 @@ var nvmeInstanceStoreVolumes = map[string]int{ "g5.16xlarge": 1, "g5.24xlarge": 1, "g5.2xlarge": 1, - "g5.48xlarge": 2, "g5.4xlarge": 1, "g5.8xlarge": 1, "g5.xlarge": 1, @@ -497,7 +501,9 @@ var nvmeInstanceStoreVolumes = map[string]int{ "z1d.xlarge": 1, } -// / https://aws.amazon.com/ec2/instance-types +// https://aws.amazon.com/ec2/instance-types +// Despite the dl1.24xlarge having Gaudi Accelerators describe instance types considers them GPUs as such that instacne type is in this table +// g5.48xlarge is not added to this table as it is in the maxVolumeLimits var gpuInstanceGpus = map[string]int{ "dl1.24xlarge": 8, "g3.16xlarge": 4, @@ -520,7 +526,6 @@ var gpuInstanceGpus = map[string]int{ "g5.16xlarge": 1, "g5.24xlarge": 4, "g5.2xlarge": 1, - "g5.48xlarge": 8, "g5.4xlarge": 1, "g5.8xlarge": 1, "g5g.16xlarge": 2, @@ -551,3 +556,21 @@ var gpuInstanceGpus = map[string]int{ "p4de.24xlarge": 8, "p5.48xlarge": 8, } + +// Note this table is not a reflection of how many accelerators an instance has but of how many slots their combined acclerators take up +// VT instance type accelerators take two slots each with the exception of the vt1.24xlarge which takes 0 slots for its accelerators +// inf1 instance types are purposely not added to this table as they are in the maxVolumeLimits table +// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html +var acceleratorSlotsTaken = map[string]int{ + "vt1.3xlarge": 2, + "vt1.6xlarge": 4, + "vt1.24xlarge": 0, + "dl2q.24xlarge": 8, + "inf2.xlarge": 1, + "inf2.8xlarge": 1, + "inf2.24xlarge": 6, + "inf2.48xlarge": 12, + "trn1.2xlarge": 1, + "trn1.32xlarge": 16, + "trn1n.32xlarge": 16, +} diff --git a/pkg/driver/node.go b/pkg/driver/node.go index 80979c04bd..69c65aa605 100644 --- a/pkg/driver/node.go +++ b/pkg/driver/node.go @@ -781,8 +781,8 @@ func (d *NodeService) getVolumesLimit() int64 { } dedicatedLimit := cloud.GetDedicatedLimitForInstanceType(instanceType) - maxEBSAttachments, ok := cloud.GetEBSLimitForInstanceType(instanceType) - if ok { + maxEBSAttachments, hasMaxVolumeLimit := cloud.GetEBSLimitForInstanceType(instanceType) + if hasMaxVolumeLimit { availableAttachments = min(maxEBSAttachments, availableAttachments) } // For special dedicated limit instance types, the limit is only for EBS volumes @@ -791,8 +791,12 @@ func (d *NodeService) getVolumesLimit() int64 { availableAttachments = dedicatedLimit } else if isNitro { enis := d.metadata.GetNumAttachedENIs() - nvmeInstanceStoreVolumes := cloud.GetReservedSlotsForInstanceType(instanceType) - availableAttachments = availableAttachments - enis - nvmeInstanceStoreVolumes + reservedSlots := cloud.GetReservedSlotsForInstanceType(instanceType) + if hasMaxVolumeLimit { + availableAttachments = availableAttachments - (enis - 1) - reservedSlots + } else { + availableAttachments = availableAttachments - enis - reservedSlots + } } availableAttachments = availableAttachments - reservedVolumeAttachments if availableAttachments <= 0 { diff --git a/pkg/driver/node_test.go b/pkg/driver/node_test.go index 1abe1c6517..92e441345f 100644 --- a/pkg/driver/node_test.go +++ b/pkg/driver/node_test.go @@ -1136,23 +1136,87 @@ func TestGetVolumesLimit(t *testing.T) { }, }, { - name: "inf1.24xlarge_volume_attach_limit", + name: "mac1.metal_volume_attach_limit", options: &Options{ VolumeAttachLimit: -1, ReservedVolumeAttachments: -1, }, - expectedVal: 9, + expectedVal: 15, metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { m := metadata.NewMockMetadataService(ctrl) m.EXPECT().GetRegion().Return("us-west-2") - m.EXPECT().GetInstanceType().Return("inf1.24xlarge") + m.EXPECT().GetInstanceType().Return("mac1.metal") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, + { + name: "u-12tb1.metal_volume_attach_limit", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 18, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("u-12tb1.metal") m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) return m }, }, { - name: "mac1.metal_volume_attach_limit", + name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 24, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("g4dn.xlarge") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, + { + name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 24, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("g4ad.xlarge") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, + { + name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 21, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("g4dn.12xlarge") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, + { + name: "dl1.24xlarge_volume_attach_limit (8 Accelerator slots , 4 InstanceStoreVolume)", options: &Options{ VolumeAttachLimit: -1, ReservedVolumeAttachments: -1, @@ -1161,73 +1225,92 @@ func TestGetVolumesLimit(t *testing.T) { metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { m := metadata.NewMockMetadataService(ctrl) m.EXPECT().GetRegion().Return("us-west-2") - m.EXPECT().GetInstanceType().Return("mac1.metal") + m.EXPECT().GetInstanceType().Return("dl1.24xlarge") m.EXPECT().GetNumBlockDeviceMappings().Return(0) m.EXPECT().GetNumAttachedENIs().Return(1) return m }, }, { - name: "u-12tb1.metal_volume_attach_limit", + // will and should fail if g5.48xlarge instance type is in any table other than maxVolumeLimits table + name: "g5.48xlarge_volume_attach_limit (Instance has attached GPUs and NVMe Instance Store volumes but should be ignored for EBS volume limits calculation)", options: &Options{ VolumeAttachLimit: -1, ReservedVolumeAttachments: -1, }, - expectedVal: 17, + expectedVal: 8, metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { m := metadata.NewMockMetadataService(ctrl) m.EXPECT().GetRegion().Return("us-west-2") - m.EXPECT().GetInstanceType().Return("u-12tb1.metal") + m.EXPECT().GetInstanceType().Return("g5.48xlarge") m.EXPECT().GetNumBlockDeviceMappings().Return(0) m.EXPECT().GetNumAttachedENIs().Return(1) return m }, }, { - name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)", + // Should fail if inf1.xlarge instance type is in any table other than maxVolumeLimits table + name: "inf1.xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)", options: &Options{ VolumeAttachLimit: -1, ReservedVolumeAttachments: -1, }, - expectedVal: 24, + expectedVal: 25, metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { m := metadata.NewMockMetadataService(ctrl) m.EXPECT().GetRegion().Return("us-west-2") - m.EXPECT().GetInstanceType().Return("g4dn.xlarge") + m.EXPECT().GetInstanceType().Return("inf1.xlarge") m.EXPECT().GetNumBlockDeviceMappings().Return(0) m.EXPECT().GetNumAttachedENIs().Return(1) return m }, }, - // 1 gpu { - name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)", + // Should fail if inf1.xlarge instance type is in any table other than maxVolumeLimits table + name: "inf1.2xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)", options: &Options{ VolumeAttachLimit: -1, ReservedVolumeAttachments: -1, }, - expectedVal: 24, + expectedVal: 25, metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { m := metadata.NewMockMetadataService(ctrl) m.EXPECT().GetRegion().Return("us-west-2") - m.EXPECT().GetInstanceType().Return("g4ad.xlarge") + m.EXPECT().GetInstanceType().Return("inf1.2xlarge") m.EXPECT().GetNumBlockDeviceMappings().Return(0) m.EXPECT().GetNumAttachedENIs().Return(1) return m }, }, - // 4 gpus { - name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)", + // Should fail if inf1.6xlarge instance type is in any table other than maxVolumeLimits table + name: "inf1.6xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)", options: &Options{ VolumeAttachLimit: -1, ReservedVolumeAttachments: -1, }, - expectedVal: 21, + expectedVal: 22, metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { m := metadata.NewMockMetadataService(ctrl) m.EXPECT().GetRegion().Return("us-west-2") - m.EXPECT().GetInstanceType().Return("g4dn.12xlarge") + m.EXPECT().GetInstanceType().Return("inf1.6xlarge") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, + { + // Should fail if inf1.24xlarge instance type is in any table other than maxVolumeLimits table + name: "inf1.24xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 10, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("inf1.24xlarge") m.EXPECT().GetNumBlockDeviceMappings().Return(0) m.EXPECT().GetNumAttachedENIs().Return(1) return m