Skip to content

Commit

Permalink
DRA: add DRAControlPlaneController feature gate for "classic DRA"
Browse files Browse the repository at this point in the history
In the API, the effect of the feature gate is that alpha fields get dropped on
create. They get preserved during updates if already set. The
PodSchedulingContext registration is *not* restricted by the feature gate.
This enables deleting stale PodSchedulingContext objects after disabling
the feature gate.

The scheduler checks the new feature gate before setting up an informer for
PodSchedulingContext objects and when deciding whether it can schedule a
pod. If any claim depends on a control plane controller, the scheduler bails
out, leading to:

    Status:       Pending
    ...
      Warning  FailedScheduling             73s   default-scheduler  0/1 nodes are available: resourceclaim depends on disabled DRAControlPlaneController feature. no new claims to deallocate, preemption: 0/1 nodes are available: 1 Preemption is not helpful for scheduling.

The rest of the changes prepare for testing the new feature separately from
"structured parameters". The goal is to have base "dra" jobs which just enable
and test those, then "classic-dra" jobs which add DRAControlPlaneController.
  • Loading branch information
pohly committed Jul 16, 2024
1 parent 0862d7b commit 40a45e6
Show file tree
Hide file tree
Showing 14 changed files with 648 additions and 98 deletions.
14 changes: 13 additions & 1 deletion pkg/features/kube_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,17 @@ const (
// alpha: v1.26
//
// Enables support for resources with custom parameters and a lifecycle
// that is independent of a Pod.
// that is independent of a Pod. Resource allocation is done by a DRA driver's
// "control plane controller" in cooperation with the scheduler.
DRAControlPlaneController featuregate.Feature = "DRAControlPlaneController"

// owner: @pohly
// kep: http://kep.k8s.io/4381
// alpha: v1.29
//
// Enables support for resources with custom parameters and a lifecycle
// that is independent of a Pod. Resource allocation is done by the scheduler
// based on "structured parameters".
DynamicResourceAllocation featuregate.Feature = "DynamicResourceAllocation"

// owner: @harche
Expand Down Expand Up @@ -1027,6 +1037,8 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS

DevicePluginCDIDevices: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.33

DRAControlPlaneController: {Default: false, PreRelease: featuregate.Alpha},

DynamicResourceAllocation: {Default: false, PreRelease: featuregate.Alpha},

EventedPLEG: {Default: false, PreRelease: featuregate.Alpha},
Expand Down
24 changes: 24 additions & 0 deletions pkg/registry/resource/deviceclass/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/apiserver/pkg/storage/names"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/api/legacyscheme"
"k8s.io/kubernetes/pkg/apis/resource"
"k8s.io/kubernetes/pkg/apis/resource/validation"
"k8s.io/kubernetes/pkg/features"
)

// deviceClassStrategy implements behavior for DeviceClass objects
Expand All @@ -43,6 +45,7 @@ func (deviceClassStrategy) NamespaceScoped() bool {
func (deviceClassStrategy) PrepareForCreate(ctx context.Context, obj runtime.Object) {
class := obj.(*resource.DeviceClass)
class.Generation = 1
dropDisabledFields(class, nil)
}

func (deviceClassStrategy) Validate(ctx context.Context, obj runtime.Object) field.ErrorList {
Expand All @@ -65,6 +68,8 @@ func (deviceClassStrategy) PrepareForUpdate(ctx context.Context, obj, old runtim
class := obj.(*resource.DeviceClass)
oldClass := old.(*resource.DeviceClass)

dropDisabledFields(class, oldClass)

// Any changes to the spec increment the generation number.
if !apiequality.Semantic.DeepEqual(oldClass.Spec, class.Spec) {
class.Generation = oldClass.Generation + 1
Expand All @@ -83,3 +88,22 @@ func (deviceClassStrategy) WarningsOnUpdate(ctx context.Context, obj, old runtim
func (deviceClassStrategy) AllowUnconditionalUpdate() bool {
return true
}

// dropDisabledFields removes fields which are covered by the optional DRAControlPlaneController feature gate.
func dropDisabledFields(newClass, oldClass *resource.DeviceClass) {
if utilfeature.DefaultFeatureGate.Enabled(features.DRAControlPlaneController) {
// No need to drop anything.
return
}

if oldClass == nil {
// Always drop on create.
newClass.Spec.SuitableNodes = nil
return
}

// Drop on update only if not already set.
if oldClass.Spec.SuitableNodes == nil {
newClass.Spec.SuitableNodes = nil
}
}
189 changes: 153 additions & 36 deletions pkg/registry/resource/deviceclass/strategy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,42 @@ package deviceclass
import (
"testing"

"github.com/stretchr/testify/assert"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
utilfeature "k8s.io/apiserver/pkg/util/feature"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/resource"
"k8s.io/kubernetes/pkg/features"
)

var deviceClass = &resource.DeviceClass{
var obj = &resource.DeviceClass{
ObjectMeta: metav1.ObjectMeta{
Name: "valid-class",
Generation: 1,
},
}

var objWithGatedFields = &resource.DeviceClass{
ObjectMeta: metav1.ObjectMeta{
Name: "valid-class",
Name: "valid-class",
Generation: 1,
},
Spec: resource.DeviceClassSpec{
SuitableNodes: &core.NodeSelector{
NodeSelectorTerms: []core.NodeSelectorTerm{{
MatchExpressions: []core.NodeSelectorRequirement{{
Key: "foo",
Operator: core.NodeSelectorOpExists,
}},
}},
},
},
}

func TestClassStrategy(t *testing.T) {
func TestStrategy(t *testing.T) {
if Strategy.NamespaceScoped() {
t.Errorf("DeviceClass must not be namespace scoped")
}
Expand All @@ -39,42 +63,135 @@ func TestClassStrategy(t *testing.T) {
}
}

func TestClassStrategyCreate(t *testing.T) {
func TestStrategyCreate(t *testing.T) {
ctx := genericapirequest.NewDefaultContext()
deviceClass := deviceClass.DeepCopy()

Strategy.PrepareForCreate(ctx, deviceClass)
errs := Strategy.Validate(ctx, deviceClass)
if len(errs) != 0 {
t.Errorf("unexpected error validating for create %v", errs)
testcases := map[string]struct {
obj *resource.DeviceClass
controlPlaneController bool
expectValidationError bool
expectObj *resource.DeviceClass
}{
"simple": {
obj: obj,
expectObj: obj,
},
"validation-error": {
obj: func() *resource.DeviceClass {
obj := obj.DeepCopy()
obj.Name = "%#@$%$"
return obj
}(),
expectValidationError: true,
},
"drop-fields": {
obj: objWithGatedFields,
controlPlaneController: false,
expectObj: obj,
},
"keep-fields": {
obj: objWithGatedFields,
controlPlaneController: true,
expectObj: objWithGatedFields,
},
}

for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.DRAControlPlaneController, tc.controlPlaneController)

obj := tc.obj.DeepCopy()
Strategy.PrepareForCreate(ctx, obj)
if errs := Strategy.Validate(ctx, obj); len(errs) != 0 {
if !tc.expectValidationError {
t.Fatalf("unexpected validation errors: %q", errs)
}
return
} else if tc.expectValidationError {
t.Fatal("expected validation error(s), got none")
}
if warnings := Strategy.WarningsOnCreate(ctx, obj); len(warnings) != 0 {
t.Fatalf("unexpected warnings: %q", warnings)
}
Strategy.Canonicalize(obj)
assert.Equal(t, tc.expectObj, obj)
})
}
}

func TestClassStrategyUpdate(t *testing.T) {
t.Run("no-changes-okay", func(t *testing.T) {
ctx := genericapirequest.NewDefaultContext()
deviceClass := deviceClass.DeepCopy()
newClass := deviceClass.DeepCopy()
newClass.ResourceVersion = "4"

Strategy.PrepareForUpdate(ctx, newClass, deviceClass)
errs := Strategy.ValidateUpdate(ctx, newClass, deviceClass)
if len(errs) != 0 {
t.Errorf("unexpected validation errors: %v", errs)
}
})

t.Run("name-change-not-allowed", func(t *testing.T) {
ctx := genericapirequest.NewDefaultContext()
deviceClass := deviceClass.DeepCopy()
newClass := deviceClass.DeepCopy()
newClass.Name = "valid-class-2"
newClass.ResourceVersion = "4"

Strategy.PrepareForUpdate(ctx, newClass, deviceClass)
errs := Strategy.ValidateUpdate(ctx, newClass, deviceClass)
if len(errs) == 0 {
t.Errorf("expected a validation error")
}
})
func TestStrategyUpdate(t *testing.T) {
ctx := genericapirequest.NewDefaultContext()

testcases := map[string]struct {
oldObj *resource.DeviceClass
newObj *resource.DeviceClass
controlPlaneController bool
expectValidationError bool
expectObj *resource.DeviceClass
}{
"no-changes-okay": {
oldObj: obj,
newObj: obj,
expectObj: obj,
},
"name-change-not-allowed": {
oldObj: obj,
newObj: func() *resource.DeviceClass {
obj := obj.DeepCopy()
obj.Name += "-2"
return obj
}(),
expectValidationError: true,
},
"drop-fields": {
oldObj: obj,
newObj: objWithGatedFields,
controlPlaneController: false,
expectObj: obj,
},
"keep-fields": {
oldObj: obj,
newObj: objWithGatedFields,
controlPlaneController: true,
expectObj: func() *resource.DeviceClass {
obj := objWithGatedFields.DeepCopy()
// Spec changes -> generation gets bumped.
obj.Generation++
return obj
}(),
},
"keep-existing-fields": {
oldObj: objWithGatedFields,
newObj: objWithGatedFields,
controlPlaneController: false,
expectObj: objWithGatedFields,
},
}

for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.DRAControlPlaneController, tc.controlPlaneController)
oldObj := tc.oldObj.DeepCopy()
newObj := tc.newObj.DeepCopy()
newObj.ResourceVersion = "4"

Strategy.PrepareForUpdate(ctx, newObj, oldObj)
if errs := Strategy.ValidateUpdate(ctx, newObj, oldObj); len(errs) != 0 {
if !tc.expectValidationError {
t.Fatalf("unexpected validation errors: %q", errs)
}
return
} else if tc.expectValidationError {
t.Fatal("expected validation error(s), got none")
}
if warnings := Strategy.WarningsOnUpdate(ctx, newObj, oldObj); len(warnings) != 0 {
t.Fatalf("unexpected warnings: %q", warnings)
}
Strategy.Canonicalize(newObj)

expectObj := tc.expectObj.DeepCopy()
expectObj.ResourceVersion = "4"
assert.Equal(t, expectObj, newObj)
})
}
}
41 changes: 41 additions & 0 deletions pkg/registry/resource/resourceclaim/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ import (
"k8s.io/apiserver/pkg/registry/generic"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/names"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/api/legacyscheme"
"k8s.io/kubernetes/pkg/apis/resource"
"k8s.io/kubernetes/pkg/apis/resource/validation"
"k8s.io/kubernetes/pkg/features"
"sigs.k8s.io/structured-merge-diff/v4/fieldpath"
)

Expand Down Expand Up @@ -65,6 +67,8 @@ func (resourceclaimStrategy) PrepareForCreate(ctx context.Context, obj runtime.O
claim := obj.(*resource.ResourceClaim)
// Status must not be set by user on create.
claim.Status = resource.ResourceClaimStatus{}

dropDisabledFields(claim, nil)
}

func (resourceclaimStrategy) Validate(ctx context.Context, obj runtime.Object) field.ErrorList {
Expand All @@ -87,6 +91,8 @@ func (resourceclaimStrategy) PrepareForUpdate(ctx context.Context, obj, old runt
newClaim := obj.(*resource.ResourceClaim)
oldClaim := old.(*resource.ResourceClaim)
newClaim.Status = oldClaim.Status

dropDisabledFields(newClaim, oldClaim)
}

func (resourceclaimStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) field.ErrorList {
Expand Down Expand Up @@ -127,6 +133,8 @@ func (resourceclaimStatusStrategy) PrepareForUpdate(ctx context.Context, obj, ol
oldClaim := old.(*resource.ResourceClaim)
newClaim.Spec = oldClaim.Spec
metav1.ResetObjectMetaForStatus(&newClaim.ObjectMeta, &oldClaim.ObjectMeta)

dropDisabledFields(newClaim, oldClaim)
}

func (resourceclaimStatusStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) field.ErrorList {
Expand Down Expand Up @@ -163,3 +171,36 @@ func toSelectableFields(claim *resource.ResourceClaim) fields.Set {
fields := generic.ObjectMetaFieldsSet(&claim.ObjectMeta, true)
return fields
}

// dropDisabledFields removes fields which are covered by the optional DRAControlPlaneController feature gate.
func dropDisabledFields(newClaim, oldClaim *resource.ResourceClaim) {
if utilfeature.DefaultFeatureGate.Enabled(features.DRAControlPlaneController) {
// No need to drop anything.
return
}

if oldClaim == nil {
// Always drop on create. There's no status yet, so nothing to do there.
newClaim.Spec.Controller = ""
return
}

// Drop on (status) update only if not already set.
if oldClaim.Spec.Controller == "" {
newClaim.Spec.Controller = ""
}
// If the claim is handled by a control plane controller, allow
// setting it also in the status. Stripping that field would be bad.
if oldClaim.Spec.Controller == "" &&
newClaim.Status.Allocation != nil &&
oldClaim.Status.Allocation == nil &&
(oldClaim.Status.Allocation == nil || oldClaim.Status.Allocation.Controller == "") {
newClaim.Status.Allocation.Controller = ""
}
// If there is an existing allocation which used a control plane controller, then
// allow requesting its deallocation.
if !oldClaim.Status.DeallocationRequested &&
(newClaim.Status.Allocation == nil || newClaim.Status.Allocation.Controller == "") {
newClaim.Status.DeallocationRequested = false
}
}
Loading

0 comments on commit 40a45e6

Please sign in to comment.