Skip to content

Commit db0f65c

Browse files
committed
Use a separate backoff policy for static pods
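Most static pods run critical components. When one of them fails and its containers must be restarted, the restart should happen as soon as possible. The kubelet therefore keeps a dedicated backoff policy for static pods (3-second initial delay, 10-second maximum) instead of sharing the regular container restart backoff.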
1 parent 1740d85 commit db0f65c

File tree

3 files changed

+16
-4
lines changed

3 files changed

+16
-4
lines changed

pkg/kubelet/kubelet.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
873873
volumepathhandler.NewBlockVolumePathHandler())
874874

875875
klet.backOff = flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff)
876+
klet.staticBackOff = flowcontrol.NewBackOff(time.Second*3, time.Second*10)
876877

877878
// setup eviction manager
878879
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig,
@@ -1259,6 +1260,8 @@ type Kubelet struct {
12591260

12601261
// Container restart Backoff
12611262
backOff *flowcontrol.Backoff
1263+
// static pod Backoff
1264+
staticBackOff *flowcontrol.Backoff
12621265

12631266
// Information about the ports which are opened by daemons on Node running this Kubelet server.
12641267
daemonEndpoints *v1.NodeDaemonEndpoints
@@ -1940,7 +1943,14 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
19401943
// Use WithoutCancel instead of a new context.TODO() to propagate trace context
19411944
// Call the container runtime's SyncPod callback
19421945
sctx := context.WithoutCancel(ctx)
1943-
result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.backOff)
1946+
backOff := kl.backOff
1947+
if pod.Annotations != nil && pod.Annotations[kubetypes.ConfigSourceAnnotationKey] == "file" {
1948+
klog.V(5).InfoS("use static backOff for pod. ", "pod", klog.KObj(pod))
1949+
backOff = kl.staticBackOff
1950+
} else {
1951+
klog.V(5).InfoS("use normal backOff for pod. ", "pod", klog.KObj(pod))
1952+
}
1953+
result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, backOff)
19441954
kl.reasonCache.Update(pod.UID, result)
19451955
if err := result.Error(); err != nil {
19461956
// Do not return error if the only failures were pods in backoff

pkg/kubelet/kubelet_pods.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1370,6 +1370,7 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
13701370

13711371
// Cleanup any backoff entries.
13721372
kl.backOff.GC()
1373+
kl.staticBackOff.GC()
13731374
return nil
13741375
}
13751376

pkg/kubelet/kubelet_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,12 @@ import (
3535
"go.opentelemetry.io/otel/sdk/trace/tracetest"
3636
oteltrace "go.opentelemetry.io/otel/trace"
3737
noopoteltrace "go.opentelemetry.io/otel/trace/noop"
38+
"k8s.io/utils/mount"
3839

3940
cadvisorapi "github.com/google/cadvisor/info/v1"
4041
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
4142
"github.com/stretchr/testify/assert"
4243
"github.com/stretchr/testify/require"
43-
core "k8s.io/client-go/testing"
44-
"k8s.io/mount-utils"
45-
4644
v1 "k8s.io/api/core/v1"
4745
"k8s.io/apimachinery/pkg/api/resource"
4846
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -52,6 +50,7 @@ import (
5250
"k8s.io/apimachinery/pkg/util/wait"
5351
utilfeature "k8s.io/apiserver/pkg/util/feature"
5452
"k8s.io/client-go/kubernetes/fake"
53+
core "k8s.io/client-go/testing"
5554
"k8s.io/client-go/tools/record"
5655
"k8s.io/client-go/util/flowcontrol"
5756
featuregatetesting "k8s.io/component-base/featuregate/testing"
@@ -330,7 +329,9 @@ func newTestKubeletWithImageList(
330329

331330
fakeClock := testingclock.NewFakeClock(time.Now())
332331
kubelet.backOff = flowcontrol.NewBackOff(time.Second, time.Minute)
332+
kubelet.staticBackOff = flowcontrol.NewBackOff(time.Second, time.Minute)
333333
kubelet.backOff.Clock = fakeClock
334+
kubelet.staticBackOff.Clock = fakeClock
334335
kubelet.resyncInterval = 10 * time.Second
335336
kubelet.workQueue = queue.NewBasicWorkQueue(fakeClock)
336337
// Relist period does not affect the tests.

0 commit comments

Comments
 (0)