Skip to content

Commit 0a26f03

Browse files
committed
Fix startup probe race condition for sidecar containers
Fixes a bug where startup probe workers terminate incorrectly for sidecar containers with restartPolicy=Always when the pod has restartPolicy=Never, causing main containers to remain stuck in the "Initializing" state.

Changes:
- Add container-level restart policy check for init containers only
- Extract complex boolean logic to named variable for readability
- Refactor test helper to use existing newWorker() function
- Fix terminology: RestartPolicyNone → RestartPolicyNever
- Add comprehensive unit and e2e tests for both scenarios
1 parent 792061a commit 0a26f03

File tree

3 files changed

+207
-1
lines changed

3 files changed

+207
-1
lines changed

pkg/kubelet/prober/worker.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,16 @@ type worker struct {
8181
proberDurationUnknownMetricLabels metrics.Labels
8282
}
8383

84+
// isInitContainer checks if the worker's container is in the pod's init containers
85+
func (w *worker) isInitContainer() bool {
86+
for _, initContainer := range w.pod.Spec.InitContainers {
87+
if initContainer.Name == w.container.Name {
88+
return true
89+
}
90+
}
91+
return false
92+
}
93+
8494
// Creates and starts a new probe worker.
8595
func newWorker(
8696
m *manager,
@@ -251,9 +261,14 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
251261
if !w.containerID.IsEmpty() {
252262
w.resultsManager.Set(w.containerID, results.Failure, w.pod)
253263
}
264+
265+
isRestartableInitContainer := w.isInitContainer() &&
266+
w.container.RestartPolicy != nil && *w.container.RestartPolicy == v1.ContainerRestartPolicyAlways
267+
254268
// Abort if the container will not be restarted.
255269
return c.State.Terminated == nil ||
256-
w.pod.Spec.RestartPolicy != v1.RestartPolicyNever
270+
w.pod.Spec.RestartPolicy != v1.RestartPolicyNever ||
271+
isRestartableInitContainer
257272
}
258273

259274
// Graceful shutdown of the pod.

pkg/kubelet/prober/worker_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,25 @@ import (
3737
func init() {
3838
}
3939

40+
// newTestWorkerWithRestartableInitContainer creates a test worker with an init container setup
41+
func newTestWorkerWithRestartableInitContainer(m *manager, probeType probeType) *worker {
42+
pod := getTestPod()
43+
44+
// Set up init container with restart policy
45+
initContainer := pod.Spec.Containers[0]
46+
initContainer.Name = testContainerName
47+
restartPolicy := v1.ContainerRestartPolicyAlways
48+
initContainer.RestartPolicy = &restartPolicy
49+
50+
// Move container to init containers and add a regular container
51+
pod.Spec.InitContainers = []v1.Container{initContainer}
52+
pod.Spec.Containers = []v1.Container{{
53+
Name: "main-container",
54+
}}
55+
56+
return newWorker(m, probeType, pod, initContainer)
57+
}
58+
4059
func TestDoProbe(t *testing.T) {
4160
logger, ctx := ktesting.NewTestContext(t)
4261
m := newTestManager()
@@ -530,6 +549,72 @@ func TestResultRunOnStartupCheckFailure(t *testing.T) {
530549
}
531550
}
532551

552+
func TestDoProbe_TerminatedRestartableInitContainerWithRestartPolicyNever(t *testing.T) {
553+
logger, ctx := ktesting.NewTestContext(t)
554+
m := newTestManager()
555+
556+
// Test restartable init container (sidecar) behavior
557+
w := newTestWorkerWithRestartableInitContainer(m, startup)
558+
559+
// Set pod restart policy to Never
560+
w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
561+
562+
// Create terminated status for init container
563+
terminatedStatus := getTestRunningStatus()
564+
terminatedStatus.InitContainerStatuses = []v1.ContainerStatus{{
565+
Name: testContainerName,
566+
ContainerID: "test://test_container_id",
567+
State: v1.ContainerState{
568+
Terminated: &v1.ContainerStateTerminated{
569+
StartedAt: metav1.Now(),
570+
},
571+
},
572+
}}
573+
terminatedStatus.ContainerStatuses[0].State.Running = nil
574+
575+
m.statusManager.SetPodStatus(logger, w.pod, terminatedStatus)
576+
577+
// Test: Terminated restartable init container with restartPolicy=Always should continue probing
578+
// even when pod has restartPolicy=Never
579+
if !w.doProbe(ctx) {
580+
t.Error("Expected to continue probing for terminated restartable init container with pod restart policy Never")
581+
}
582+
583+
// Verify result is set to Failure for terminated container
584+
expectResult(t, w, results.Failure, "restartable init container with pod restart policy Never")
585+
}
586+
587+
func TestDoProbe_TerminatedContainerWithRestartPolicyNever(t *testing.T) {
588+
logger, ctx := ktesting.NewTestContext(t)
589+
m := newTestManager()
590+
591+
// Test that regular containers (without RestartPolicy) still work as before
592+
w := newTestWorker(m, startup, v1.Probe{})
593+
594+
// Regular container without explicit restart policy
595+
w.container.RestartPolicy = nil
596+
597+
// Set pod restart policy to Never
598+
w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
599+
600+
// Create terminated status
601+
terminatedStatus := getTestRunningStatus()
602+
terminatedStatus.ContainerStatuses[0].State.Running = nil
603+
terminatedStatus.ContainerStatuses[0].State.Terminated = &v1.ContainerStateTerminated{
604+
StartedAt: metav1.Now(),
605+
}
606+
607+
m.statusManager.SetPodStatus(logger, w.pod, terminatedStatus)
608+
609+
// Should stop probing (existing behavior preserved)
610+
if w.doProbe(ctx) {
611+
t.Error("Expected to stop probing for regular container with pod RestartPolicy=Never")
612+
}
613+
614+
// Verify result is set to Failure for terminated container
615+
expectResult(t, w, results.Failure, "regular container with pod restart policy Never")
616+
}
617+
533618
func TestLivenessProbeDisabledByStarted(t *testing.T) {
534619
logger, ctx := ktesting.NewTestContext(t)
535620
m := newTestManager()

test/e2e_node/container_lifecycle_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5905,7 +5905,113 @@ var _ = SIGDescribe(feature.SidecarContainers, "Containers Lifecycle", func() {
59055905
})
59065906
})
59075907
})
5908+
59085909
})
5910+
5911+
ginkgo.When("A restartable init container with startup probe fails initially", func() {
	ginkgo.It("should continue probing and allow regular container to start after the restartable init container recovers", func(ctx context.Context) {
		restartableInit := "buggy-restartable-init"
		regularContainer := "regular-container"

		restartPolicyAlways := v1.ContainerRestartPolicyAlways

		// The sidecar exits non-zero on its first run, then — after the
		// kubelet restarts it — stays up, so the startup probe can succeed
		// and the regular container can start.
		pod := &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name: "restartable-init-startup-probe-fix",
			},
			Spec: v1.PodSpec{
				RestartPolicy: v1.RestartPolicyNever,
				InitContainers: []v1.Container{{
					Name:          restartableInit,
					Image:         busyboxImage,
					RestartPolicy: &restartPolicyAlways,
					Command: []string{"sh", "-c", `
if [ ! -f /persistent/first_run_done ]; then
	echo 'First run: creating marker and exiting with 1'
	touch /persistent/first_run_done
	exit 1
else
	echo 'Second run: marker found, running as sidecar'
	sleep 120
fi`},
					StartupProbe: &v1.Probe{
						InitialDelaySeconds: 3,
						PeriodSeconds:       2,
						FailureThreshold:    10,
						ProbeHandler: v1.ProbeHandler{
							Exec: &v1.ExecAction{
								Command: []string{"/bin/true"},
							},
						},
					},
				}},
				Containers: []v1.Container{{
					Name:  regularContainer,
					Image: imageutils.GetPauseImageName(),
					StartupProbe: &v1.Probe{
						InitialDelaySeconds: 5,
						PeriodSeconds:       5,
						ProbeHandler: v1.ProbeHandler{
							Exec: &v1.ExecAction{
								Command: []string{"/bin/true"},
							},
						},
					},
				}},
			},
		}

		preparePod(pod)
		client := e2epod.NewPodClient(f)
		pod = client.Create(ctx, pod)

		ginkgo.By("Waiting for init container to fail and restart at least once")
		framework.ExpectNoError(
			e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
				"restartable init restarted", 90*time.Second,
				func(p *v1.Pod) (bool, error) {
					for _, st := range p.Status.InitContainerStatuses {
						if st.Name == restartableInit && st.RestartCount > 0 {
							framework.Logf("Init container %s has restarted %d times", restartableInit, st.RestartCount)
							return true, nil
						}
					}
					return false, nil
				}),
		)

		ginkgo.By("Waiting for init container to be running after restart")
		framework.ExpectNoError(
			e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
				"restartable init running", 90*time.Second,
				func(p *v1.Pod) (bool, error) {
					for _, st := range p.Status.InitContainerStatuses {
						if st.Name == restartableInit && st.State.Running != nil {
							framework.Logf("Init container %s is now running", restartableInit)
							return true, nil
						}
					}
					return false, nil
				}),
		)

		// With the fix, the startup probe worker keeps running after the
		// sidecar's first termination, so the main container is unblocked.
		ginkgo.By("Waiting for regular container to start")
		framework.ExpectNoError(
			e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
				"regular container running", 120*time.Second,
				func(p *v1.Pod) (bool, error) {
					for _, st := range p.Status.ContainerStatuses {
						if st.Name == regularContainer && st.State.Running != nil {
							framework.Logf("Regular container %s is running", regularContainer)
							return true, nil
						}
					}
					return false, nil
				}),
		)
	})
})
6014+
59096015
})
59106016

59116017
var _ = SIGDescribe(feature.SidecarContainers, framework.WithSerial(), "Containers Lifecycle", func() {

0 commit comments

Comments
 (0)