Skip to content

Commit 516340b

Browse files
committed
Report actionable error when GC fails due to disk pressure
When the image GC fails to free enough space due to disk pressure, the user-visible event it generates is misleading (see #71869) - the technical detail leads operators to suspect GC problems. This PR makes the message actionable by focusing on the disk pressure. I had hoped to include the total disk space consumed by images that cannot be collected, to help operators quickly determine if disk pressure is caused by a large number of active images or by other files on the node, but this is blocked because the CRI API doesn't reveal the total disk use (depending on the runtime it has either the compressed or uncompressed space: #120698).
1 parent ff657e1 commit 516340b

File tree

3 files changed

+69
-11
lines changed

3 files changed

+69
-11
lines changed

pkg/kubelet/images/image_gc_manager.go

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,15 +391,22 @@ func (im *realImageGCManager) GarbageCollect(ctx context.Context, beganGC time.T
391391
klog.InfoS("Disk usage on image filesystem is over the high threshold, trying to free bytes down to the low threshold", "usage", usagePercent, "highThreshold", im.policy.HighThresholdPercent, "amountToFree", amountToFree, "lowThreshold", im.policy.LowThresholdPercent)
392392
remainingImages, freed, err := im.freeSpace(ctx, amountToFree, freeTime, images)
393393
if err != nil {
394+
// Failed to delete images, eg due to a read-only filesystem.
394395
return err
395396
}
396397

397398
im.runPostGCHooks(remainingImages, freeTime)
398399

399400
if freed < amountToFree {
400-
err := fmt.Errorf("Failed to garbage collect required amount of images. Attempted to free %d bytes, but only found %d bytes eligible to free.", amountToFree, freed)
401-
im.recorder.Eventf(im.nodeRef, v1.EventTypeWarning, events.FreeDiskSpaceFailed, err.Error())
402-
return err
401+
// This usually means the disk is full for reasons other than container
402+
// images, such as logs, volumes, or other files. However, it could also
403+
// be due to an unusually large number or size of in-use container images.
404+
message := fmt.Sprintf("Insufficient free disk space on the node's image filesystem (%d%% of %s used). "+
405+
"Failed to free sufficient space by deleting unused images. "+
406+
"Investigate disk usage, as it could be used by active images, logs, volumes, or other data.",
407+
usagePercent, formatSize(capacity))
408+
im.recorder.Eventf(im.nodeRef, v1.EventTypeWarning, events.FreeDiskSpaceFailed, "%s", message)
409+
return fmt.Errorf("%s", message)
403410
}
404411
}
405412

@@ -576,6 +583,31 @@ func (im *realImageGCManager) imagesInEvictionOrder(ctx context.Context, freeTim
576583
return images, nil
577584
}
578585

586+
// formatSize returns a human-readable string for a given size in bytes.
587+
func formatSize(sizeBytes int64) string {
588+
const (
589+
KiB = 1024
590+
MiB = 1024 * KiB
591+
GiB = 1024 * MiB
592+
TiB = 1024 * GiB
593+
)
594+
595+
size := float64(sizeBytes)
596+
597+
switch {
598+
case size < KiB:
599+
return fmt.Sprintf("%d B", int64(size))
600+
case size < MiB:
601+
return fmt.Sprintf("%.1f KiB", size/KiB)
602+
case size < GiB:
603+
return fmt.Sprintf("%.1f MiB", size/MiB)
604+
case size < TiB:
605+
return fmt.Sprintf("%.1f GiB", size/GiB)
606+
default:
607+
return fmt.Sprintf("%.1f TiB", size/TiB)
608+
}
609+
}
610+
579611
// If RuntimeClassInImageCriAPI feature gate is enabled, imageRecords
580612
// are identified by a tuple of (imageId,runtimeHandler) that is passed
581613
// from ListImages() call. If no runtimehandler is specified in response

pkg/kubelet/images/image_gc_manager_test.go

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,7 @@ func TestGarbageCollectCadvisorFailure(t *testing.T) {
645645
manager, _ := newRealImageGCManager(policy, mockStatsProvider)
646646

647647
mockStatsProvider.EXPECT().ImageFsStats(mock.Anything).Return(&statsapi.FsStats{}, &statsapi.FsStats{}, fmt.Errorf("error"))
648-
assert.Error(t, manager.GarbageCollect(ctx, time.Now()))
648+
require.Error(t, manager.GarbageCollect(ctx, time.Now()))
649649
}
650650

651651
func TestGarbageCollectBelowSuccess(t *testing.T) {
@@ -678,19 +678,45 @@ func TestGarbageCollectNotEnoughFreed(t *testing.T) {
678678
LowThresholdPercent: 80,
679679
}
680680
mockStatsProvider := statstest.NewMockProvider(t)
681-
manager, fakeRuntime := newRealImageGCManager(policy, mockStatsProvider)
681+
fakeRuntime := &containertest.FakeRuntime{}
682+
recorder := record.NewFakeRecorder(1)
683+
manager := &realImageGCManager{
684+
runtime: fakeRuntime,
685+
policy: policy,
686+
imageRecords: make(map[string]*imageRecord),
687+
statsProvider: mockStatsProvider,
688+
recorder: recorder,
689+
tracer: noopoteltrace.NewTracerProvider().Tracer(""),
690+
}
682691

683-
// Expect 95% usage and little of it gets freed.
692+
// Initial state: 95% usage, of which 5% is garbage images.
693+
capacity := uint64(10 * 1024 * 1024 * 1024)
694+
available := capacity / 20 // 5% available, 95% usage
684695
imageFs := &statsapi.FsStats{
685-
AvailableBytes: ptr.To(uint64(50)),
686-
CapacityBytes: ptr.To(uint64(1000)),
696+
AvailableBytes: ptr.To(available),
697+
CapacityBytes: ptr.To(capacity),
687698
}
688699
mockStatsProvider.EXPECT().ImageFsStats(mock.Anything).Return(imageFs, imageFs, nil)
700+
// This image is unused and eligible for deletion.
701+
// Its size is less than the required amount to free.
702+
imageSize := int64(500 * 1024 * 1024) // 500 MiB
689703
fakeRuntime.ImageList = []container.Image{
690-
makeImage(0, 50),
704+
makeImage(0, imageSize),
691705
}
692706

693-
assert.Error(t, manager.GarbageCollect(ctx, time.Now()))
707+
err := manager.GarbageCollect(ctx, time.Now())
708+
require.Error(t, err)
709+
710+
// Check that a warning event was sent
711+
expectedEvent := "Warning FreeDiskSpaceFailed Insufficient free disk space on the node's image filesystem" +
712+
" (95% of 10.0 GiB used). Failed to free sufficient space by deleting unused images." +
713+
" Investigate disk usage, as it could be used by active images, logs, volumes, or other data."
714+
select {
715+
case event := <-recorder.Events:
716+
assert.Equal(t, expectedEvent, event)
717+
default:
718+
t.Errorf("expected a 'FreeDiskSpaceFailed' event")
719+
}
694720
}
695721

696722
func TestGarbageCollectImageNotOldEnough(t *testing.T) {

pkg/kubelet/kubelet.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1609,7 +1609,7 @@ func (kl *Kubelet) StartGarbageCollection() {
16091609
if prevImageGCFailed {
16101610
klog.ErrorS(err, "Image garbage collection failed multiple times in a row")
16111611
// Only create an event for repeated failures
1612-
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, err.Error())
1612+
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, "%s", err.Error())
16131613
} else {
16141614
klog.ErrorS(err, "Image garbage collection failed once. Stats initialization may not have completed yet")
16151615
}

0 commit comments

Comments
 (0)