Skip to content

Commit 52c0fdb

Browse files
committed
drm/i915: Replace global breadcrumbs with per-context interrupt tracking
A few years ago, see commit 688e6c7 ("drm/i915: Slaughter the thundering i915_wait_request herd"), the issue of handling multiple clients waiting in parallel was brought to our attention. The requirement was that every client should be woken immediately upon its request being signaled, without incurring any cpu overhead. Handling certain fragility of our hw meant that we could not do a simple check inside the irq handler (some generations required almost unbounded delays before we could be sure of seqno coherency) and so request completion checking required delegation. Before commit 688e6c7, the solution was simple. Every client waiting on a request would be woken on every interrupt and each would do a heavyweight check to see if their request was complete. Commit 688e6c7 introduced an rbtree so that only the earliest waiter on the global timeline would be woken, and would wake the next and so on. (Along with various complications to handle requests being reordered along the global timeline, and also a requirement for kthread to provide a delegate for fence signaling that had no process context.) The global rbtree depends on knowing the execution timeline (and global seqno). Without knowing that order, we must instead check all contexts queued to the HW to see which may have advanced. We trim that list by only checking queued contexts that are being waited on, but still we keep a list of all active contexts and their active signalers that we inspect from inside the irq handler. By moving the waiters onto the fence signal list, we can combine the client wakeup with the dma_fence signaling (a dramatic reduction in complexity, but does require the HW being coherent, the seqno must be visible from the cpu before the interrupt is raised - we keep a timer backup just in case).
Having previously fixed all the issues with irq-seqno serialisation (by inserting delays onto the GPU after each request instead of random delays on the CPU after each interrupt), we can rely on the seqno state to perform direct wakeups from the interrupt handler. This allows us to preserve our single context switch behaviour of the current routine, with the only downside that we lose the RT priority sorting of wakeups. In general, direct wakeup latency of multiple clients is about the same (about 10% better in most cases) with a reduction in total CPU time spent in the waiter (about 20-50% depending on gen). Average herd behaviour is improved, but at the cost of not delegating wakeups on task_prio. v2: Capture fence signaling state for error state and add comments to warm even the most cold of hearts. v3: Check if the request is still active before busywaiting v4: Reduce the amount of pointer misdirection with list_for_each_safe and using a local i915_request variable inside the loops v5: Add a missing pluralisation to a purely informative selftest message. References: 688e6c7 ("drm/i915: Slaughter the thundering i915_wait_request herd") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-2-chris@chris-wilson.co.uk
1 parent 3df0bd1 commit 52c0fdb

23 files changed

+890
-1481
lines changed

drivers/gpu/drm/i915/i915_debugfs.c

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,29 +1315,16 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
13151315
seq_printf(m, "GT active? %s\n", yesno(dev_priv->gt.awake));
13161316

13171317
for_each_engine(engine, dev_priv, id) {
1318-
struct intel_breadcrumbs *b = &engine->breadcrumbs;
1319-
struct rb_node *rb;
1320-
13211318
seq_printf(m, "%s:\n", engine->name);
13221319
seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
13231320
engine->hangcheck.seqno, seqno[id],
13241321
intel_engine_last_submit(engine),
13251322
jiffies_to_msecs(jiffies -
13261323
engine->hangcheck.action_timestamp));
1327-
seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
1328-
yesno(intel_engine_has_waiter(engine)),
1324+
seq_printf(m, "\tfake irq active? %s\n",
13291325
yesno(test_bit(engine->id,
13301326
&dev_priv->gpu_error.missed_irq_rings)));
13311327

1332-
spin_lock_irq(&b->rb_lock);
1333-
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
1334-
struct intel_wait *w = rb_entry(rb, typeof(*w), node);
1335-
1336-
seq_printf(m, "\t%s [%d] waiting for %x\n",
1337-
w->tsk->comm, w->tsk->pid, w->seqno);
1338-
}
1339-
spin_unlock_irq(&b->rb_lock);
1340-
13411328
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
13421329
(long long)engine->hangcheck.acthd,
13431330
(long long)acthd[id]);
@@ -2021,18 +2008,6 @@ static int i915_swizzle_info(struct seq_file *m, void *data)
20212008
return 0;
20222009
}
20232010

2024-
static int count_irq_waiters(struct drm_i915_private *i915)
2025-
{
2026-
struct intel_engine_cs *engine;
2027-
enum intel_engine_id id;
2028-
int count = 0;
2029-
2030-
for_each_engine(engine, i915, id)
2031-
count += intel_engine_has_waiter(engine);
2032-
2033-
return count;
2034-
}
2035-
20362011
static const char *rps_power_to_str(unsigned int power)
20372012
{
20382013
static const char * const strings[] = {
@@ -2072,7 +2047,6 @@ static int i915_rps_boost_info(struct seq_file *m, void *data)
20722047
seq_printf(m, "RPS enabled? %d\n", rps->enabled);
20732048
seq_printf(m, "GPU busy? %s [%d requests]\n",
20742049
yesno(dev_priv->gt.awake), dev_priv->gt.active_requests);
2075-
seq_printf(m, "CPU waiting? %d\n", count_irq_waiters(dev_priv));
20762050
seq_printf(m, "Boosts outstanding? %d\n",
20772051
atomic_read(&rps->num_waiters));
20782052
seq_printf(m, "Interactive? %d\n", READ_ONCE(rps->power.interactive));

drivers/gpu/drm/i915/i915_gem_context.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,9 @@ intel_context_init(struct intel_context *ce,
327327
struct intel_engine_cs *engine)
328328
{
329329
ce->gem_context = ctx;
330+
331+
INIT_LIST_HEAD(&ce->signal_link);
332+
INIT_LIST_HEAD(&ce->signals);
330333
}
331334

332335
static struct i915_gem_context *

drivers/gpu/drm/i915/i915_gem_context.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ struct i915_gem_context {
164164
struct intel_context {
165165
struct i915_gem_context *gem_context;
166166
struct intel_engine_cs *active;
167+
struct list_head signal_link;
168+
struct list_head signals;
167169
struct i915_vma *state;
168170
struct intel_ring *ring;
169171
u32 *lrc_reg_state;

drivers/gpu/drm/i915/i915_gpu_error.c

Lines changed: 8 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -447,9 +447,14 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
447447
if (!erq->seqno)
448448
return;
449449

450-
err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
450+
err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
451451
prefix, erq->pid, erq->ban_score,
452-
erq->context, erq->seqno, erq->sched_attr.priority,
452+
erq->context, erq->seqno,
453+
test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
454+
&erq->flags) ? "!" : "",
455+
test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
456+
&erq->flags) ? "+" : "",
457+
erq->sched_attr.priority,
453458
jiffies_to_msecs(erq->jiffies - epoch),
454459
erq->start, erq->head, erq->tail);
455460
}
@@ -530,7 +535,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
530535
}
531536
err_printf(m, " seqno: 0x%08x\n", ee->seqno);
532537
err_printf(m, " last_seqno: 0x%08x\n", ee->last_seqno);
533-
err_printf(m, " waiting: %s\n", yesno(ee->waiting));
534538
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
535539
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
536540
err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n",
@@ -804,21 +808,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
804808
error->epoch);
805809
}
806810

807-
if (IS_ERR(ee->waiters)) {
808-
err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
809-
m->i915->engine[i]->name);
810-
} else if (ee->num_waiters) {
811-
err_printf(m, "%s --- %d waiters\n",
812-
m->i915->engine[i]->name,
813-
ee->num_waiters);
814-
for (j = 0; j < ee->num_waiters; j++) {
815-
err_printf(m, " seqno 0x%08x for %s [%d]\n",
816-
ee->waiters[j].seqno,
817-
ee->waiters[j].comm,
818-
ee->waiters[j].pid);
819-
}
820-
}
821-
822811
print_error_obj(m, m->i915->engine[i],
823812
"ringbuffer", ee->ringbuffer);
824813

@@ -1000,8 +989,6 @@ void __i915_gpu_state_free(struct kref *error_ref)
1000989
i915_error_object_free(ee->wa_ctx);
1001990

1002991
kfree(ee->requests);
1003-
if (!IS_ERR_OR_NULL(ee->waiters))
1004-
kfree(ee->waiters);
1005992
}
1006993

1007994
for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
@@ -1205,59 +1192,6 @@ static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
12051192
I915_READ(RING_SYNC_2(engine->mmio_base));
12061193
}
12071194

1208-
static void error_record_engine_waiters(struct intel_engine_cs *engine,
1209-
struct drm_i915_error_engine *ee)
1210-
{
1211-
struct intel_breadcrumbs *b = &engine->breadcrumbs;
1212-
struct drm_i915_error_waiter *waiter;
1213-
struct rb_node *rb;
1214-
int count;
1215-
1216-
ee->num_waiters = 0;
1217-
ee->waiters = NULL;
1218-
1219-
if (RB_EMPTY_ROOT(&b->waiters))
1220-
return;
1221-
1222-
if (!spin_trylock_irq(&b->rb_lock)) {
1223-
ee->waiters = ERR_PTR(-EDEADLK);
1224-
return;
1225-
}
1226-
1227-
count = 0;
1228-
for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
1229-
count++;
1230-
spin_unlock_irq(&b->rb_lock);
1231-
1232-
waiter = NULL;
1233-
if (count)
1234-
waiter = kmalloc_array(count,
1235-
sizeof(struct drm_i915_error_waiter),
1236-
GFP_ATOMIC);
1237-
if (!waiter)
1238-
return;
1239-
1240-
if (!spin_trylock_irq(&b->rb_lock)) {
1241-
kfree(waiter);
1242-
ee->waiters = ERR_PTR(-EDEADLK);
1243-
return;
1244-
}
1245-
1246-
ee->waiters = waiter;
1247-
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
1248-
struct intel_wait *w = rb_entry(rb, typeof(*w), node);
1249-
1250-
strcpy(waiter->comm, w->tsk->comm);
1251-
waiter->pid = w->tsk->pid;
1252-
waiter->seqno = w->seqno;
1253-
waiter++;
1254-
1255-
if (++ee->num_waiters == count)
1256-
break;
1257-
}
1258-
spin_unlock_irq(&b->rb_lock);
1259-
}
1260-
12611195
static void error_record_engine_registers(struct i915_gpu_state *error,
12621196
struct intel_engine_cs *engine,
12631197
struct drm_i915_error_engine *ee)
@@ -1293,7 +1227,6 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
12931227

12941228
intel_engine_get_instdone(engine, &ee->instdone);
12951229

1296-
ee->waiting = intel_engine_has_waiter(engine);
12971230
ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
12981231
ee->acthd = intel_engine_get_active_head(engine);
12991232
ee->seqno = intel_engine_get_seqno(engine);
@@ -1367,6 +1300,7 @@ static void record_request(struct i915_request *request,
13671300
{
13681301
struct i915_gem_context *ctx = request->gem_context;
13691302

1303+
erq->flags = request->fence.flags;
13701304
erq->context = ctx->hw_id;
13711305
erq->sched_attr = request->sched.attr;
13721306
erq->ban_score = atomic_read(&ctx->ban_score);
@@ -1542,7 +1476,6 @@ static void gem_record_rings(struct i915_gpu_state *error)
15421476
ee->engine_id = i;
15431477

15441478
error_record_engine_registers(error, engine, ee);
1545-
error_record_engine_waiters(engine, ee);
15461479
error_record_engine_execlists(engine, ee);
15471480

15481481
request = i915_gem_find_active_request(engine);

drivers/gpu/drm/i915/i915_gpu_error.h

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,6 @@ struct i915_gpu_state {
8282
int engine_id;
8383
/* Software tracked state */
8484
bool idle;
85-
bool waiting;
86-
int num_waiters;
8785
unsigned long hangcheck_timestamp;
8886
struct i915_address_space *vm;
8987
int num_requests;
@@ -147,6 +145,7 @@ struct i915_gpu_state {
147145
struct drm_i915_error_object *default_state;
148146

149147
struct drm_i915_error_request {
148+
unsigned long flags;
150149
long jiffies;
151150
pid_t pid;
152151
u32 context;
@@ -159,12 +158,6 @@ struct i915_gpu_state {
159158
} *requests, execlist[EXECLIST_MAX_PORTS];
160159
unsigned int num_ports;
161160

162-
struct drm_i915_error_waiter {
163-
char comm[TASK_COMM_LEN];
164-
pid_t pid;
165-
u32 seqno;
166-
} *waiters;
167-
168161
struct {
169162
u32 gfx_mode;
170163
union {

drivers/gpu/drm/i915/i915_irq.c

Lines changed: 11 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,66 +1169,6 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
11691169
return;
11701170
}
11711171

1172-
static void notify_ring(struct intel_engine_cs *engine)
1173-
{
1174-
const u32 seqno = intel_engine_get_seqno(engine);
1175-
struct i915_request *rq = NULL;
1176-
struct task_struct *tsk = NULL;
1177-
struct intel_wait *wait;
1178-
1179-
if (unlikely(!engine->breadcrumbs.irq_armed))
1180-
return;
1181-
1182-
rcu_read_lock();
1183-
1184-
spin_lock(&engine->breadcrumbs.irq_lock);
1185-
wait = engine->breadcrumbs.irq_wait;
1186-
if (wait) {
1187-
/*
1188-
* We use a callback from the dma-fence to submit
1189-
* requests after waiting on our own requests. To
1190-
* ensure minimum delay in queuing the next request to
1191-
* hardware, signal the fence now rather than wait for
1192-
* the signaler to be woken up. We still wake up the
1193-
* waiter in order to handle the irq-seqno coherency
1194-
* issues (we may receive the interrupt before the
1195-
* seqno is written, see __i915_request_irq_complete())
1196-
* and to handle coalescing of multiple seqno updates
1197-
* and many waiters.
1198-
*/
1199-
if (i915_seqno_passed(seqno, wait->seqno)) {
1200-
struct i915_request *waiter = wait->request;
1201-
1202-
if (waiter &&
1203-
!i915_request_signaled(waiter) &&
1204-
intel_wait_check_request(wait, waiter))
1205-
rq = i915_request_get(waiter);
1206-
1207-
tsk = wait->tsk;
1208-
}
1209-
1210-
engine->breadcrumbs.irq_count++;
1211-
} else {
1212-
if (engine->breadcrumbs.irq_armed)
1213-
__intel_engine_disarm_breadcrumbs(engine);
1214-
}
1215-
spin_unlock(&engine->breadcrumbs.irq_lock);
1216-
1217-
if (rq) {
1218-
spin_lock(&rq->lock);
1219-
dma_fence_signal_locked(&rq->fence);
1220-
GEM_BUG_ON(!i915_request_completed(rq));
1221-
spin_unlock(&rq->lock);
1222-
1223-
i915_request_put(rq);
1224-
}
1225-
1226-
if (tsk && tsk->state & TASK_NORMAL)
1227-
wake_up_process(tsk);
1228-
1229-
rcu_read_unlock();
1230-
}
1231-
12321172
static void vlv_c0_read(struct drm_i915_private *dev_priv,
12331173
struct intel_rps_ei *ei)
12341174
{
@@ -1473,20 +1413,20 @@ static void ilk_gt_irq_handler(struct drm_i915_private *dev_priv,
14731413
u32 gt_iir)
14741414
{
14751415
if (gt_iir & GT_RENDER_USER_INTERRUPT)
1476-
notify_ring(dev_priv->engine[RCS]);
1416+
intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
14771417
if (gt_iir & ILK_BSD_USER_INTERRUPT)
1478-
notify_ring(dev_priv->engine[VCS]);
1418+
intel_engine_breadcrumbs_irq(dev_priv->engine[VCS]);
14791419
}
14801420

14811421
static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
14821422
u32 gt_iir)
14831423
{
14841424
if (gt_iir & GT_RENDER_USER_INTERRUPT)
1485-
notify_ring(dev_priv->engine[RCS]);
1425+
intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
14861426
if (gt_iir & GT_BSD_USER_INTERRUPT)
1487-
notify_ring(dev_priv->engine[VCS]);
1427+
intel_engine_breadcrumbs_irq(dev_priv->engine[VCS]);
14881428
if (gt_iir & GT_BLT_USER_INTERRUPT)
1489-
notify_ring(dev_priv->engine[BCS]);
1429+
intel_engine_breadcrumbs_irq(dev_priv->engine[BCS]);
14901430

14911431
if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
14921432
GT_BSD_CS_ERROR_INTERRUPT |
@@ -1506,7 +1446,7 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
15061446
tasklet = true;
15071447

15081448
if (iir & GT_RENDER_USER_INTERRUPT) {
1509-
notify_ring(engine);
1449+
intel_engine_breadcrumbs_irq(engine);
15101450
tasklet |= USES_GUC_SUBMISSION(engine->i915);
15111451
}
15121452

@@ -1852,7 +1792,7 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
18521792

18531793
if (HAS_VEBOX(dev_priv)) {
18541794
if (pm_iir & PM_VEBOX_USER_INTERRUPT)
1855-
notify_ring(dev_priv->engine[VECS]);
1795+
intel_engine_breadcrumbs_irq(dev_priv->engine[VECS]);
18561796

18571797
if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT)
18581798
DRM_DEBUG("Command parser error, pm_iir 0x%08x\n", pm_iir);
@@ -4276,7 +4216,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
42764216
I915_WRITE16(IIR, iir);
42774217

42784218
if (iir & I915_USER_INTERRUPT)
4279-
notify_ring(dev_priv->engine[RCS]);
4219+
intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
42804220

42814221
if (iir & I915_MASTER_ERROR_INTERRUPT)
42824222
i8xx_error_irq_handler(dev_priv, eir, eir_stuck);
@@ -4384,7 +4324,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
43844324
I915_WRITE(IIR, iir);
43854325

43864326
if (iir & I915_USER_INTERRUPT)
4387-
notify_ring(dev_priv->engine[RCS]);
4327+
intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
43884328

43894329
if (iir & I915_MASTER_ERROR_INTERRUPT)
43904330
i9xx_error_irq_handler(dev_priv, eir, eir_stuck);
@@ -4529,10 +4469,10 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
45294469
I915_WRITE(IIR, iir);
45304470

45314471
if (iir & I915_USER_INTERRUPT)
4532-
notify_ring(dev_priv->engine[RCS]);
4472+
intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
45334473

45344474
if (iir & I915_BSD_USER_INTERRUPT)
4535-
notify_ring(dev_priv->engine[VCS]);
4475+
intel_engine_breadcrumbs_irq(dev_priv->engine[VCS]);
45364476

45374477
if (iir & I915_MASTER_ERROR_INTERRUPT)
45384478
i9xx_error_irq_handler(dev_priv, eir, eir_stuck);

0 commit comments

Comments
 (0)