Skip to content

Commit 8ba306a

Browse files
committed
drm/i915: Share per-timeline HWSP using a slab suballocator
If we restrict ourselves to only using a cacheline for each timeline's HWSP (we could go smaller, but want to avoid needlessly polluting cachelines on different engines between different contexts), then we can suballocate a single 4k page into 64 different timeline HWSP. By treating each fresh allocation as a slab of 64 entries, we can keep it around for the next 64 allocation attempts until we need to refresh the slab cache. John Harrison noted the issue of fragmentation leading to the same worst case performance of one page per timeline as before, which can be mitigated by adopting a freelist. v2: Keep all partially allocated HWSP on a freelist. This is still without migration, so it is possible for the system to end up with each timeline in its own page, but we ensure that no new allocation would needlessly allocate a fresh page! v3: Throw a selftest at the allocator to try and catch invalid cacheline reuse. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190128181812.22804-4-chris@chris-wilson.co.uk
1 parent 52954ed commit 8ba306a

File tree

6 files changed

+280
-28
lines changed

6 files changed

+280
-28
lines changed

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1978,6 +1978,10 @@ struct drm_i915_private {
19781978
struct i915_gt_timelines {
19791979
struct mutex mutex; /* protects list, tainted by GPU */
19801980
struct list_head list;
1981+
1982+
/* Pack multiple timelines' seqnos into the same page */
1983+
spinlock_t hwsp_lock; /* held around every hwsp_free_list and per-page free_bitmap update in i915_timeline.c */
1984+
struct list_head hwsp_free_list; /* shared HWSP pages that still have at least one unused cacheline */
19811985
} timelines;
19821986

19831987
struct list_head active_rings;

drivers/gpu/drm/i915/i915_timeline.c

Lines changed: 103 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,18 @@
99
#include "i915_timeline.h"
1010
#include "i915_syncmap.h"
1111

12+
/* Bookkeeping for one HWSP page that is suballocated, one cacheline at a time, between timelines. */
struct i915_timeline_hwsp {
13+
struct i915_vma *vma; /* the backing HWSP; hwsp_alloc() sets vma->private to point back at this struct */
14+
struct list_head free_link; /* entry in gt.timelines.hwsp_free_list while at least one cacheline is free */
15+
u64 free_bitmap; /* one bit per cacheline slot; a set bit means the slot is unallocated */
16+
};
17+
18+
/* Return the suballocator bookkeeping stored in the private pointer of the timeline's HWSP vma. */
static inline struct i915_timeline_hwsp *
19+
i915_timeline_hwsp(const struct i915_timeline *tl)
20+
{
21+
return tl->hwsp_ggtt->private; /* hwsp_free() treats a NULL result as "global HWSP, not ours to free" */
22+
}
23+
1224
static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
1325
{
1426
struct drm_i915_gem_object *obj;
@@ -27,28 +39,89 @@ static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
2739
return vma;
2840
}
2941

30-
static int hwsp_alloc(struct i915_timeline *timeline)
42+
/* Claim one free cacheline for @timeline from a shared HWSP page, allocating a new page only when the freelist is empty; returns the page's vma (or an ERR_PTR) and stores the slot index in *cacheline. */
static struct i915_vma *
43+
hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
3144
{
32-
struct i915_vma *vma;
45+
struct drm_i915_private *i915 = timeline->i915;
46+
struct i915_gt_timelines *gt = &i915->gt.timelines;
47+
struct i915_timeline_hwsp *hwsp;
3348

34-
vma = __hwsp_alloc(timeline->i915);
35-
if (IS_ERR(vma))
36-
return PTR_ERR(vma);
49+
BUILD_BUG_ON(BITS_PER_TYPE(u64) * CACHELINE_BYTES > PAGE_SIZE); /* every bit of the u64 bitmap must map to a cacheline inside one page */
3750

38-
timeline->hwsp_ggtt = vma;
39-
timeline->hwsp_offset = 0;
51+
spin_lock(&gt->hwsp_lock);
4052

41-
return 0;
53+
/* hwsp_free_list only contains HWSP that have available cachelines */
54+
hwsp = list_first_entry_or_null(&gt->hwsp_free_list,
55+
typeof(*hwsp), free_link);
56+
if (!hwsp) {
57+
struct i915_vma *vma;
58+
59+
spin_unlock(&gt->hwsp_lock); /* the allocations below are done outside the spinlock */
60+
61+
hwsp = kmalloc(sizeof(*hwsp), GFP_KERNEL);
62+
if (!hwsp)
63+
return ERR_PTR(-ENOMEM);
64+
65+
vma = __hwsp_alloc(i915);
66+
if (IS_ERR(vma)) {
67+
kfree(hwsp);
68+
return vma; /* propagate the ERR_PTR from __hwsp_alloc() unchanged */
69+
}
70+
71+
vma->private = hwsp; /* back-pointer used by i915_timeline_hwsp() */
72+
hwsp->vma = vma;
73+
hwsp->free_bitmap = ~0ull; /* every cacheline slot starts out free */
74+
75+
spin_lock(&gt->hwsp_lock);
76+
list_add(&hwsp->free_link, &gt->hwsp_free_list); /* publish the fresh page so later allocations can share it */
77+
}
78+
79+
GEM_BUG_ON(!hwsp->free_bitmap);
80+
*cacheline = __ffs64(hwsp->free_bitmap); /* take the lowest-numbered free slot */
81+
hwsp->free_bitmap &= ~BIT_ULL(*cacheline);
82+
if (!hwsp->free_bitmap)
83+
list_del(&hwsp->free_link); /* page is now full, so it no longer belongs on the freelist */
84+
85+
spin_unlock(&gt->hwsp_lock);
86+
87+
GEM_BUG_ON(hwsp->vma->private != hwsp);
88+
return hwsp->vma;
89+
}
90+
91+
/* Hand @timeline's cacheline back to its shared HWSP page, re-listing the page if it was full and releasing it once every cacheline is free again. */
static void hwsp_free(struct i915_timeline *timeline)
92+
{
93+
struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
94+
struct i915_timeline_hwsp *hwsp;
95+
96+
hwsp = i915_timeline_hwsp(timeline);
97+
if (!hwsp) /* leave global HWSP alone! */
98+
return;
99+
100+
spin_lock(&gt->hwsp_lock);
101+
102+
/* As a cacheline becomes available, publish the HWSP on the freelist */
103+
if (!hwsp->free_bitmap)
104+
list_add_tail(&hwsp->free_link, &gt->hwsp_free_list);
105+
106+
hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES); /* inverse of hwsp_offset = cacheline * CACHELINE_BYTES set in i915_timeline_init() */
107+
108+
/* And if no one is left using it, give the page back to the system */
109+
if (hwsp->free_bitmap == ~0ull) {
110+
i915_vma_put(hwsp->vma); /* NOTE(review): presumably drops the reference obtained from __hwsp_alloc() - confirm */
111+
list_del(&hwsp->free_link); /* a page with any free bit is on the freelist here, either already or via the add above */
112+
kfree(hwsp);
113+
}
114+
115+
spin_unlock(&gt->hwsp_lock);
42116
}
43117

44118
int i915_timeline_init(struct drm_i915_private *i915,
45119
struct i915_timeline *timeline,
46120
const char *name,
47-
struct i915_vma *global_hwsp)
121+
struct i915_vma *hwsp)
48122
{
49123
struct i915_gt_timelines *gt = &i915->gt.timelines;
50124
void *vaddr;
51-
int err;
52125

53126
/*
54127
* Ideally we want a set of engines on a single leaf as we expect
@@ -64,18 +137,22 @@ int i915_timeline_init(struct drm_i915_private *i915,
64137
timeline->name = name;
65138
timeline->pin_count = 0;
66139

67-
if (global_hwsp) {
68-
timeline->hwsp_ggtt = i915_vma_get(global_hwsp);
69-
timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
70-
} else {
71-
err = hwsp_alloc(timeline);
72-
if (err)
73-
return err;
140+
timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
141+
if (!hwsp) {
142+
unsigned int cacheline;
143+
144+
hwsp = hwsp_alloc(timeline, &cacheline);
145+
if (IS_ERR(hwsp))
146+
return PTR_ERR(hwsp);
147+
148+
timeline->hwsp_offset = cacheline * CACHELINE_BYTES;
74149
}
150+
timeline->hwsp_ggtt = i915_vma_get(hwsp);
75151

76-
vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB);
152+
vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
77153
if (IS_ERR(vaddr)) {
78-
i915_vma_put(timeline->hwsp_ggtt);
154+
hwsp_free(timeline);
155+
i915_vma_put(hwsp);
79156
return PTR_ERR(vaddr);
80157
}
81158

@@ -105,6 +182,9 @@ void i915_timelines_init(struct drm_i915_private *i915)
105182
mutex_init(&gt->mutex);
106183
INIT_LIST_HEAD(&gt->list);
107184

185+
spin_lock_init(&gt->hwsp_lock);
186+
INIT_LIST_HEAD(&gt->hwsp_free_list);
187+
108188
/* via i915_gem_wait_for_idle() */
109189
i915_gem_shrinker_taints_mutex(i915, &gt->mutex);
110190
}
@@ -144,12 +224,13 @@ void i915_timeline_fini(struct i915_timeline *timeline)
144224
GEM_BUG_ON(timeline->pin_count);
145225
GEM_BUG_ON(!list_empty(&timeline->requests));
146226

147-
i915_syncmap_free(&timeline->sync);
148-
149227
mutex_lock(&gt->mutex);
150228
list_del(&timeline->link);
151229
mutex_unlock(&gt->mutex);
152230

231+
i915_syncmap_free(&timeline->sync);
232+
hwsp_free(timeline);
233+
153234
i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
154235
i915_vma_put(timeline->hwsp_ggtt);
155236
}
@@ -226,6 +307,7 @@ void i915_timelines_fini(struct drm_i915_private *i915)
226307
struct i915_gt_timelines *gt = &i915->gt.timelines;
227308

228309
GEM_BUG_ON(!list_empty(&gt->list));
310+
GEM_BUG_ON(!list_empty(&gt->hwsp_free_list));
229311

230312
mutex_destroy(&gt->mutex);
231313
}

drivers/gpu/drm/i915/i915_timeline.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "i915_utils.h"
3434

3535
struct i915_vma;
36+
struct i915_timeline_hwsp;
3637

3738
struct i915_timeline {
3839
u64 fence_context;

drivers/gpu/drm/i915/selftests/i915_random.c

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,37 @@ u64 i915_prandom_u64_state(struct rnd_state *rnd)
4141
return x;
4242
}
4343

44-
void i915_random_reorder(unsigned int *order, unsigned int count,
45-
struct rnd_state *state)
44+
/* Shuffle, in place, the @count elements of @elsz bytes each stored at @arr, drawing random indices from @state. */
void i915_prandom_shuffle(void *arr, size_t elsz, size_t count,
45+
struct rnd_state *state)
4646
{
47-
unsigned int i, j;
47+
char stack[128]; /* bounce buffer holding one element during a swap */
48+
49+
if (WARN_ON(elsz > sizeof(stack) || count > U32_MAX)) /* element must fit the bounce buffer; NOTE(review): count limit presumably matches a u32 bound in i915_prandom_u32_max_state() - confirm */
50+
return;
51+
52+
if (!elsz || !count) /* nothing to shuffle; also keeps --count below from wrapping when count is 0 */
53+
return;
54+
55+
/* Fisher-Yates shuffle courtesy of Knuth */
56+
while (--count) {
57+
size_t swp;
58+
59+
swp = i915_prandom_u32_max_state(count + 1, state); /* NOTE(review): presumably yields an index in [0, count] - confirm against the helper */
60+
if (swp == count)
61+
continue; /* element stays in place; skip the self-swap */
4862

49-
for (i = 0; i < count; i++) {
50-
BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32));
51-
j = i915_prandom_u32_max_state(count, state);
52-
swap(order[i], order[j]);
63+
memcpy(stack, arr + count * elsz, elsz); /* three-step swap of elements count and swp via the bounce buffer */
64+
memcpy(arr + count * elsz, arr + swp * elsz, elsz);
65+
memcpy(arr + swp * elsz, stack, elsz);
5366
}
5467
}
5568

69+
/* Shuffle an array of @count unsigned ints in place; thin wrapper over i915_prandom_shuffle(). */
void i915_random_reorder(unsigned int *order, unsigned int count,
70+
struct rnd_state *state)
71+
{
72+
i915_prandom_shuffle(order, sizeof(*order), count, state); /* element size is taken from the array's own type */
73+
}
74+
5675
unsigned int *i915_random_order(unsigned int count, struct rnd_state *state)
5776
{
5877
unsigned int *order, i;

drivers/gpu/drm/i915/selftests/i915_random.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,7 @@ void i915_random_reorder(unsigned int *order,
5454
unsigned int count,
5555
struct rnd_state *state);
5656

57+
void i915_prandom_shuffle(void *arr, size_t elsz, size_t count,
58+
struct rnd_state *state);
59+
5760
#endif /* !__I915_SELFTESTS_RANDOM_H__ */

0 commit comments

Comments
 (0)