Skip to content

Commit 24a65e6

Browse files
committed
drm/i915/hangcheck: Prevent long walks across full-ppgtt
With full-ppgtt, it takes the GPU an eon to traverse the entire 256PiB address space, causing a loop to be detected. Under the current scheme, if ACTHD walks off the end of a batch buffer and into an empty address space, we "never" detect the hang. If we always increment the score as the ACTHD is progressing then we will eventually time out (after ~46.5s (31 * 1.5s) without advancing onto a new batch). To counteract this, increase the amount we reduce the score for good batches, so that only a series of almost-bad batches trigger a full reset. DoS detection suffers slightly but a series of long-running shader tests will benefit. Based on a patch from Chris Wilson. Testcase: igt/drv_hangman/hangcheck-unterminated Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Link: http://patchwork.freedesktop.org/patch/msgid/1456930109-21532-1-git-send-email-mika.kuoppala@intel.com
1 parent d431440 commit 24a65e6

File tree

4 files changed

+7
-16
lines changed

4 files changed

+7
-16
lines changed

drivers/gpu/drm/i915/i915_debugfs.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1367,8 +1367,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
13671367
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
13681368
(long long)ring->hangcheck.acthd,
13691369
(long long)acthd[i]);
1370-
seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
1371-
(long long)ring->hangcheck.max_acthd);
13721370
seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
13731371
seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
13741372

drivers/gpu/drm/i915/i915_gpu_error.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,6 @@ static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
230230
return "wait";
231231
case HANGCHECK_ACTIVE:
232232
return "active";
233-
case HANGCHECK_ACTIVE_LOOP:
234-
return "active (loop)";
235233
case HANGCHECK_KICK:
236234
return "kick";
237235
case HANGCHECK_HUNG:

drivers/gpu/drm/i915/i915_irq.c

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3001,12 +3001,7 @@ head_stuck(struct intel_engine_cs *ring, u64 acthd)
30013001
memset(ring->hangcheck.instdone, 0,
30023002
sizeof(ring->hangcheck.instdone));
30033003

3004-
if (acthd > ring->hangcheck.max_acthd) {
3005-
ring->hangcheck.max_acthd = acthd;
3006-
return HANGCHECK_ACTIVE;
3007-
}
3008-
3009-
return HANGCHECK_ACTIVE_LOOP;
3004+
return HANGCHECK_ACTIVE;
30103005
}
30113006

30123007
if (!subunits_stuck(ring))
@@ -3083,6 +3078,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
30833078
#define BUSY 1
30843079
#define KICK 5
30853080
#define HUNG 20
3081+
#define ACTIVE_DECAY 15
30863082

30873083
if (!i915.enable_hangcheck)
30883084
return;
@@ -3151,9 +3147,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
31513147
switch (ring->hangcheck.action) {
31523148
case HANGCHECK_IDLE:
31533149
case HANGCHECK_WAIT:
3154-
case HANGCHECK_ACTIVE:
31553150
break;
3156-
case HANGCHECK_ACTIVE_LOOP:
3151+
case HANGCHECK_ACTIVE:
31573152
ring->hangcheck.score += BUSY;
31583153
break;
31593154
case HANGCHECK_KICK:
@@ -3172,10 +3167,12 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
31723167
* attempts across multiple batches.
31733168
*/
31743169
if (ring->hangcheck.score > 0)
3175-
ring->hangcheck.score--;
3170+
ring->hangcheck.score -= ACTIVE_DECAY;
3171+
if (ring->hangcheck.score < 0)
3172+
ring->hangcheck.score = 0;
31763173

31773174
/* Clear head and subunit states on seqno movement */
3178-
ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
3175+
ring->hangcheck.acthd = 0;
31793176

31803177
memset(ring->hangcheck.instdone, 0,
31813178
sizeof(ring->hangcheck.instdone));

drivers/gpu/drm/i915/intel_ringbuffer.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ enum intel_ring_hangcheck_action {
7979
HANGCHECK_IDLE = 0,
8080
HANGCHECK_WAIT,
8181
HANGCHECK_ACTIVE,
82-
HANGCHECK_ACTIVE_LOOP,
8382
HANGCHECK_KICK,
8483
HANGCHECK_HUNG,
8584
};
@@ -88,7 +87,6 @@ enum intel_ring_hangcheck_action {
8887

8988
struct intel_ring_hangcheck {
9089
u64 acthd;
91-
u64 max_acthd;
9290
u32 seqno;
9391
int score;
9492
enum intel_ring_hangcheck_action action;

0 commit comments

Comments
 (0)