Skip to content

Commit 0a26f03

Browse files
committed
Fix startup probe race condition for sidecar containers
Fixes a bug where startup probe workers terminate incorrectly for sidecar containers with restartPolicy=Always when the pod has restartPolicy=Never, causing main containers to remain stuck in the "Initializing" state.

Changes:
- Add container-level restart policy check for init containers only
- Extract complex boolean logic to named variable for readability
- Refactor test helper to use existing newWorker() function
- Fix terminology: RestartPolicyNone → RestartPolicyNever
- Add comprehensive unit and e2e tests for both scenarios
1 parent 792061a commit 0a26f03

File tree

3 files changed

+207
-1
lines changed

3 files changed

+207
-1
lines changed

pkg/kubelet/prober/worker.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,16 @@ type worker struct {
8181
proberDurationUnknownMetricLabels metrics.Labels
8282
}
8383

84+
// isInitContainer checks if the worker's container is in the pod's init containers
85+
func (w *worker) isInitContainer() bool {
86+
for _, initContainer := range w.pod.Spec.InitContainers {
87+
if initContainer.Name == w.container.Name {
88+
return true
89+
}
90+
}
91+
return false
92+
}
93+
8494
// Creates and starts a new probe worker.
8595
func newWorker(
8696
m *manager,
@@ -251,9 +261,14 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
251261
if !w.containerID.IsEmpty() {
252262
w.resultsManager.Set(w.containerID, results.Failure, w.pod)
253263
}
264+
265+
isRestartableInitContainer := w.isInitContainer() &&
266+
w.container.RestartPolicy != nil && *w.container.RestartPolicy == v1.ContainerRestartPolicyAlways
267+
254268
// Abort if the container will not be restarted.
255269
return c.State.Terminated == nil ||
256-
w.pod.Spec.RestartPolicy != v1.RestartPolicyNever
270+
w.pod.Spec.RestartPolicy != v1.RestartPolicyNever ||
271+
isRestartableInitContainer
257272
}
258273

259274
// Graceful shutdown of the pod.

pkg/kubelet/prober/worker_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,25 @@ import (
3737
func init() {
3838
}
3939

40+
// newTestWorkerWithRestartableInitContainer creates a test worker with an init container setup
41+
func newTestWorkerWithRestartableInitContainer(m *manager, probeType probeType) *worker {
42+
pod := getTestPod()
43+
44+
// Set up init container with restart policy
45+
initContainer := pod.Spec.Containers[0]
46+
initContainer.Name = testContainerName
47+
restartPolicy := v1.ContainerRestartPolicyAlways
48+
initContainer.RestartPolicy = &restartPolicy
49+
50+
// Move container to init containers and add a regular container
51+
pod.Spec.InitContainers = []v1.Container{initContainer}
52+
pod.Spec.Containers = []v1.Container{{
53+
Name: "main-container",
54+
}}
55+
56+
return newWorker(m, probeType, pod, initContainer)
57+
}
58+
4059
func TestDoProbe(t *testing.T) {
4160
logger, ctx := ktesting.NewTestContext(t)
4261
m := newTestManager()
@@ -530,6 +549,72 @@ func TestResultRunOnStartupCheckFailure(t *testing.T) {
530549
}
531550
}
532551

552+
func TestDoProbe_TerminatedRestartableInitContainerWithRestartPolicyNever(t *testing.T) {
553+
logger, ctx := ktesting.NewTestContext(t)
554+
m := newTestManager()
555+
556+
// Test restartable init container (sidecar) behavior
557+
w := newTestWorkerWithRestartableInitContainer(m, startup)
558+
559+
// Set pod restart policy to Never
560+
w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
561+
562+
// Create terminated status for init container
563+
terminatedStatus := getTestRunningStatus()
564+
terminatedStatus.InitContainerStatuses = []v1.ContainerStatus{{
565+
Name: testContainerName,
566+
ContainerID: "test://test_container_id",
567+
State: v1.ContainerState{
568+
Terminated: &v1.ContainerStateTerminated{
569+
StartedAt: metav1.Now(),
570+
},
571+
},
572+
}}
573+
terminatedStatus.ContainerStatuses[0].State.Running = nil
574+
575+
m.statusManager.SetPodStatus(logger, w.pod, terminatedStatus)
576+
577+
// Test: Terminated restartable init container with restartPolicy=Always should continue probing
578+
// even when pod has restartPolicy=Never
579+
if !w.doProbe(ctx) {
580+
t.Error("Expected to continue probing for terminated restartable init container with pod restart policy Never")
581+
}
582+
583+
// Verify result is set to Failure for terminated container
584+
expectResult(t, w, results.Failure, "restartable init container with pod restart policy Never")
585+
}
586+
587+
func TestDoProbe_TerminatedContainerWithRestartPolicyNever(t *testing.T) {
588+
logger, ctx := ktesting.NewTestContext(t)
589+
m := newTestManager()
590+
591+
// Test that regular containers (without RestartPolicy) still work as before
592+
w := newTestWorker(m, startup, v1.Probe{})
593+
594+
// Regular container without explicit restart policy
595+
w.container.RestartPolicy = nil
596+
597+
// Set pod restart policy to Never
598+
w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
599+
600+
// Create terminated status
601+
terminatedStatus := getTestRunningStatus()
602+
terminatedStatus.ContainerStatuses[0].State.Running = nil
603+
terminatedStatus.ContainerStatuses[0].State.Terminated = &v1.ContainerStateTerminated{
604+
StartedAt: metav1.Now(),
605+
}
606+
607+
m.statusManager.SetPodStatus(logger, w.pod, terminatedStatus)
608+
609+
// Should stop probing (existing behavior preserved)
610+
if w.doProbe(ctx) {
611+
t.Error("Expected to stop probing for regular container with pod RestartPolicy=Never")
612+
}
613+
614+
// Verify result is set to Failure for terminated container
615+
expectResult(t, w, results.Failure, "regular container with pod restart policy Never")
616+
}
617+
533618
func TestLivenessProbeDisabledByStarted(t *testing.T) {
534619
logger, ctx := ktesting.NewTestContext(t)
535620
m := newTestManager()

test/e2e_node/container_lifecycle_test.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5905,7 +5905,113 @@ var _ = SIGDescribe(feature.SidecarContainers, "Containers Lifecycle", func() {
59055905
})
59065906
})
59075907
})
5908+
59085909
})
5910+
5911+
ginkgo.When("A restartable init container with startup probe fails initially", func() {
	ginkgo.It("should continue probing and allow regular container to start after the restartable init container recovers", func(ctx context.Context) {
		restartableInit := "buggy-restartable-init"
		regularContainer := "regular-container"

		restartPolicyAlways := v1.ContainerRestartPolicyAlways

		// The sidecar exits non-zero on its first run, then — after the
		// kubelet restarts it — stays up, so the startup probe can succeed
		// and the regular container can start.
		pod := &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name: "restartable-init-startup-probe-fix",
			},
			Spec: v1.PodSpec{
				RestartPolicy: v1.RestartPolicyNever,
				InitContainers: []v1.Container{{
					Name:          restartableInit,
					Image:         busyboxImage,
					RestartPolicy: &restartPolicyAlways,
					Command: []string{"sh", "-c", `
if [ ! -f /persistent/first_run_done ]; then
	echo 'First run: creating marker and exiting with 1'
	touch /persistent/first_run_done
	exit 1
else
	echo 'Second run: marker found, running as sidecar'
	sleep 120
fi`},
					StartupProbe: &v1.Probe{
						InitialDelaySeconds: 3,
						PeriodSeconds:       2,
						FailureThreshold:    10,
						ProbeHandler: v1.ProbeHandler{
							Exec: &v1.ExecAction{
								Command: []string{"/bin/true"},
							},
						},
					},
				}},
				Containers: []v1.Container{{
					Name:  regularContainer,
					Image: imageutils.GetPauseImageName(),
					StartupProbe: &v1.Probe{
						InitialDelaySeconds: 5,
						PeriodSeconds:       5,
						ProbeHandler: v1.ProbeHandler{
							Exec: &v1.ExecAction{
								Command: []string{"/bin/true"},
							},
						},
					},
				}},
			},
		}

		preparePod(pod)
		client := e2epod.NewPodClient(f)
		pod = client.Create(ctx, pod)

		ginkgo.By("Waiting for init container to fail and restart at least once")
		framework.ExpectNoError(
			e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
				"restartable init restarted", 90*time.Second,
				func(p *v1.Pod) (bool, error) {
					for _, st := range p.Status.InitContainerStatuses {
						if st.Name == restartableInit && st.RestartCount > 0 {
							framework.Logf("Init container %s has restarted %d times", restartableInit, st.RestartCount)
							return true, nil
						}
					}
					return false, nil
				}),
		)

		ginkgo.By("Waiting for init container to be running after restart")
		framework.ExpectNoError(
			e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
				"restartable init running", 90*time.Second,
				func(p *v1.Pod) (bool, error) {
					for _, st := range p.Status.InitContainerStatuses {
						if st.Name == restartableInit && st.State.Running != nil {
							framework.Logf("Init container %s is now running", restartableInit)
							return true, nil
						}
					}
					return false, nil
				}),
		)

		// With the fix, the startup probe worker keeps running after the
		// sidecar's first termination, so the main container is unblocked.
		ginkgo.By("Waiting for regular container to start")
		framework.ExpectNoError(
			e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
				"regular container running", 120*time.Second,
				func(p *v1.Pod) (bool, error) {
					for _, st := range p.Status.ContainerStatuses {
						if st.Name == regularContainer && st.State.Running != nil {
							framework.Logf("Regular container %s is running", regularContainer)
							return true, nil
						}
					}
					return false, nil
				}),
		)
	})
})
6014+
59096015
})
59106016

59116017
var _ = SIGDescribe(feature.SidecarContainers, framework.WithSerial(), "Containers Lifecycle", func() {

0 commit comments

Comments
 (0)