
Commit c93bdd0

Mel Gorman authored and torvalds committed
netvm: allow skb allocation to use PFMEMALLOC reserves
Change the skb allocation API to indicate RX usage and use this to fall
back to the PFMEMALLOC reserve when needed. SKBs allocated from the
reserve are tagged in skb->pfmemalloc. If an SKB is allocated from the
reserve and the socket is later found to be unrelated to page reclaim,
the packet is dropped so that the memory remains available for page
reclaim. Network protocols are expected to recover from this packet loss.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
[davem@davemloft.net: Use static branches, coding style corrections]
[sebastian@breakpoint.cc: Avoid unnecessary cast, fix !CONFIG_NET build]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 7cb0240 commit c93bdd0
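
For orientation before the per-file hunks: the flow this commit enables, sketched below in hypothetical caller code. The function name reclaim_transport_setup is illustrative and not part of the commit. A kernel socket that services page reclaim, such as a swap-over-network transport, opts in via the existing sk_set_memalloc() helper; this commit makes that opt-in enable the new memalloc_socks static key, lets RX skb allocations fall back to the PFMEMALLOC reserves, and makes sk_filter() drop reserve-backed packets headed for ordinary sockets.

/* Hedged sketch, not from this commit: how a reclaim-critical kernel
 * socket would opt in. */
static void reclaim_transport_setup(struct sock *sk)
{
	/*
	 * Sets SOCK_MEMALLOC and ORs __GFP_MEMALLOC into sk->sk_allocation;
	 * after this commit it also enables the memalloc_socks static key,
	 * which lets SKB_ALLOC_RX allocations dip into the reserves.
	 */
	sk_set_memalloc(sk);
}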

File tree

7 files changed: 142 additions(+), 30 deletions(-)

include/linux/gfp.h

Lines changed: 3 additions & 0 deletions
@@ -385,6 +385,9 @@ void drain_local_pages(void *dummy);
  */
 extern gfp_t gfp_allowed_mask;
 
+/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);

include/linux/skbuff.h

Lines changed: 12 additions & 2 deletions
@@ -462,6 +462,7 @@ struct sk_buff {
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
+	__u8			pfmemalloc:1;
 	__u8			ooo_okay:1;
 	__u8			l4_rxhash:1;
 	__u8			wifi_acked_valid:1;
@@ -502,6 +503,15 @@ struct sk_buff {
 #include <linux/slab.h>
 
 
+#define SKB_ALLOC_FCLONE	0x01
+#define SKB_ALLOC_RX		0x02
+
+/* Returns true if the skb was allocated from PFMEMALLOC reserves */
+static inline bool skb_pfmemalloc(const struct sk_buff *skb)
+{
+	return unlikely(skb->pfmemalloc);
+}
+
 /*
  * skb might have a dst pointer attached, refcounted or not.
  * _skb_refdst low order bit is set if refcount was _not_ taken
@@ -565,7 +575,7 @@ extern bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 			     bool *fragstolen, int *delta_truesize);
 
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int flags, int node);
 extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
@@ -576,7 +586,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size,
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, NUMA_NO_NODE);
+	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
 }
 
 extern void skb_recycle(struct sk_buff *skb);
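
The old int fclone parameter of __alloc_skb() becomes a flags word: existing fclone callers pass SKB_ALLOC_FCLONE, and receive paths can add SKB_ALLOC_RX. A minimal sketch of a call site under the new API; the wrapper name and size parameter are illustrative, not from the commit.

/* Sketch: an RX-path allocation under the new flags API. With at least
 * one memalloc socket registered, SKB_ALLOC_RX permits fallback to the
 * PFMEMALLOC reserves. */
static struct sk_buff *rx_alloc_example(unsigned int size)
{
	struct sk_buff *skb = __alloc_skb(size, GFP_ATOMIC, SKB_ALLOC_RX,
					  NUMA_NO_NODE);

	/* skb_pfmemalloc() reports whether the data came from the
	 * emergency reserve; such packets survive only as far as
	 * SOCK_MEMALLOC sockets (see net/core/filter.c below). */
	if (skb && skb_pfmemalloc(skb))
		pr_debug("skb backed by pfmemalloc reserves\n");

	return skb;
}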

include/net/sock.h

Lines changed: 15 additions & 0 deletions
@@ -659,6 +659,21 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
 	return test_bit(flag, &sk->sk_flags);
 }
 
+#ifdef CONFIG_NET
+extern struct static_key memalloc_socks;
+static inline int sk_memalloc_socks(void)
+{
+	return static_key_false(&memalloc_socks);
+}
+#else
+
+static inline int sk_memalloc_socks(void)
+{
+	return 0;
+}
+
+#endif
+
 static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask)
 {
 	return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
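
sk_memalloc_socks() is a static branch, so the common case of no memalloc sockets costs a patched-out jump rather than a load and test on every RX allocation. A minimal self-contained sketch of the idiom using the 3.x-era static-key API, with hypothetical names (my_key, fast_path):

#include <linux/static_key.h>
#include <linux/printk.h>

static struct static_key my_key = STATIC_KEY_INIT_FALSE;

void fast_path(void)
{
	/* Compiles to a no-op until the key is enabled; the branch is
	 * patched at runtime when the enable count goes from 0 to 1. */
	if (static_key_false(&my_key))
		pr_info("rare feature active\n");
}

void enable_feature(void)
{
	static_key_slow_inc(&my_key);	/* reference-counted enable */
}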

mm/internal.h

Lines changed: 0 additions & 3 deletions
@@ -279,9 +279,6 @@ static inline struct page *mem_map_next(struct page *iter,
 #define __paginginit __init
 #endif
 
-/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
-
 /* Memory initialisation debug and verification */
 enum mminit_level {
 	MMINIT_WARNING,

net/core/filter.c

Lines changed: 8 additions & 0 deletions
@@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	int err;
 	struct sk_filter *filter;
 
+	/*
+	 * If the skb was allocated from pfmemalloc reserves, only
+	 * allow SOCK_MEMALLOC sockets to use it as this socket is
+	 * helping free memory
+	 */
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+		return -ENOMEM;
+
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;
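
The new check sits in front of the socket filter proper, so every protocol that funnels packets through sk_filter() (for example via sock_queue_rcv_skb()) inherits the drop rule without modification. A hedged sketch of the receive-side consequence; queue_to_socket is a hypothetical wrapper, not a kernel function.

/* Hypothetical receive-path wrapper showing where the drop happens. */
static int queue_to_socket(struct sock *sk, struct sk_buff *skb)
{
	int err;

	/* Returns -ENOMEM if skb_pfmemalloc(skb) is true and the socket
	 * lacks SOCK_MEMALLOC; the reserve memory is freed promptly and
	 * the protocol's retransmit machinery recovers the lost data. */
	err = sk_filter(sk, skb);
	if (err)
		return err;

	/* ... enqueue onto sk->sk_receive_queue as usual ... */
	return 0;
}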

net/core/skbuff.c

Lines changed: 99 additions & 25 deletions
@@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 	BUG();
 }
 
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+	__kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+			bool *pfmemalloc)
+{
+	void *obj;
+	bool ret_pfmemalloc = false;
+
+	/*
+	 * Try a regular allocation, when that fails and we're not entitled
+	 * to the reserves, fail.
+	 */
+	obj = kmalloc_node_track_caller(size,
+					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+					node);
+	if (obj || !(gfp_pfmemalloc_allowed(flags)))
+		goto out;
+
+	/* Try again but now we are using pfmemalloc reserves */
+	ret_pfmemalloc = true;
+	obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+	if (pfmemalloc)
+		*pfmemalloc = ret_pfmemalloc;
+
+	return obj;
+}
+
 /* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
  *	'private' fields and also do memory statistics to find all the
  *	[BEEP] leaks.
@@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
  *	@gfp_mask: allocation mask
- *	@fclone: allocate from fclone cache instead of head cache
- *		and allocate a cloned (child) skb
+ *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ *		instead of head cache and allocate a cloned (child) skb.
+ *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ *		allocations in case the data is required for writeback
  *	@node: numa node to allocate memory on
  *
  *	Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	bool pfmemalloc;
 
-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
+
+	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;
 
 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	 */
 	size = SKB_DATA_ALIGN(size);
 	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	data = kmalloc_node_track_caller(size, gfp_mask, node);
+	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
 	if (!data)
 		goto nodata;
 	/* kmalloc(size) might give us more room than requested.
@@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	/* Account for allocated memory : skb + skb->head */
 	skb->truesize = SKB_TRUESIZE(size);
+	skb->pfmemalloc = pfmemalloc;
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
@@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	atomic_set(&shinfo->dataref, 1);
 	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
@@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		atomic_set(fclone_ref, 1);
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+		child->pfmemalloc = pfmemalloc;
 	}
 out:
 	return skb;
@@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
 
 #define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
 
-/**
- * netdev_alloc_frag - allocate a page fragment
- * @fragsz: fragment size
- *
- * Allocates a frag from a page for receive buffer.
- * Uses GFP_ATOMIC allocations.
- */
-void *netdev_alloc_frag(unsigned int fragsz)
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
 	struct netdev_alloc_cache *nc;
 	void *data = NULL;
@@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz)
 	nc = &__get_cpu_var(netdev_alloc_cache);
 	if (unlikely(!nc->page)) {
 refill:
-		nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		nc->page = alloc_page(gfp_mask);
 		if (unlikely(!nc->page))
 			goto end;
 recycle:
@@ -343,6 +382,18 @@ void *netdev_alloc_frag(unsigned int fragsz)
 	local_irq_restore(flags);
 	return data;
 }
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
 EXPORT_SYMBOL(netdev_alloc_frag);
 
 /**
@@ -366,15 +417,21 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
-		void *data = netdev_alloc_frag(fragsz);
+		void *data;
+
+		if (sk_memalloc_socks())
+			gfp_mask |= __GFP_MEMALLOC;
+
+		data = __netdev_alloc_frag(fragsz, gfp_mask);
 
 		if (likely(data)) {
 			skb = build_skb(data, fragsz);
 			if (unlikely(!skb))
 				put_page(virt_to_head_page(data));
 		}
 	} else {
-		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+				  SKB_ALLOC_RX, NUMA_NO_NODE);
 	}
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
@@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #if IS_ENABLED(CONFIG_IP_VS)
 	new->ipvs_property	= old->ipvs_property;
 #endif
+	new->pfmemalloc		= old->pfmemalloc;
 	new->protocol		= old->protocol;
 	new->mark		= old->mark;
 	new->skb_iif		= old->skb_iif;
@@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (skb_pfmemalloc(skb))
+			gfp_mask |= __GFP_MEMALLOC;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
 
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+	if (skb_pfmemalloc(skb))
+		return SKB_ALLOC_RX;
+	return 0;
+}
+
 /**
  *	skb_copy	-	create private copy of an sk_buff
  *	@skb: buffer to copy
@@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb_headroom(skb);
 	unsigned int size = skb_end_offset(skb) + skb->data_len;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		return NULL;
@@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy);
 struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
 {
 	unsigned int size = skb_headlen(skb) + headroom;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		goto out;
@@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 
 	size = SKB_DATA_ALIGN(size);
 
-	data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
-		       gfp_mask);
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+			       gfp_mask, NUMA_NO_NODE, NULL);
 	if (!data)
 		goto nodata;
 	size = SKB_WITH_OVERHEAD(ksize(data));
@@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	/*
 	 *	Allocate the copy buffer
 	 */
-	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
-				      gfp_mask);
+	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+					gfp_mask, skb_alloc_rx_flag(skb),
+					NUMA_NO_NODE);
 	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
 	int off;
@@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
 			skb_release_head_state(nskb);
 			__skb_push(nskb, doffset);
 		} else {
-			nskb = alloc_skb(hsize + doffset + headroom,
-					 GFP_ATOMIC);
+			nskb = __alloc_skb(hsize + doffset + headroom,
+					   GFP_ATOMIC, skb_alloc_rx_flag(skb),
+					   NUMA_NO_NODE);
 
 			if (unlikely(!nskb))
 				goto err;
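
A pattern recurs through these hunks: every path that re-allocates an skb's storage (copy, clone, pskb_expand_head, segmentation) passes skb_alloc_rx_flag() or __GFP_MEMALLOC so the copy may use the reserve, while __copy_skb_header() carries the pfmemalloc tag over, so reserve-backed data never loses the marking that sk_filter() enforces. Condensed restatement below; the wrapper name and size/gfp_mask parameters are illustrative.

/* Condensed restatement of the copy-path pattern above, using only
 * names from this diff. */
static struct sk_buff *copy_keeps_pedigree(const struct sk_buff *skb,
					   unsigned int size, gfp_t gfp_mask)
{
	/* A copy of a reserve-backed skb may itself need the reserve... */
	struct sk_buff *n = __alloc_skb(size, gfp_mask,
					skb_alloc_rx_flag(skb), NUMA_NO_NODE);

	/* ...and __copy_skb_header() (reached via copy_skb_header()) then
	 * sets new->pfmemalloc = old->pfmemalloc, so the sk_filter() drop
	 * rule still applies downstream. */
	return n;
}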

net/core/sock.c

Lines changed: 5 additions & 0 deletions
@@ -271,6 +271,9 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
+struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(memalloc_socks);
+
 /**
  * sk_set_memalloc - sets %SOCK_MEMALLOC
  * @sk: socket to set it on
@@ -283,13 +286,15 @@ void sk_set_memalloc(struct sock *sk)
 {
 	sock_set_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation |= __GFP_MEMALLOC;
+	static_key_slow_inc(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 
 void sk_clear_memalloc(struct sock *sk)
 {
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
+	static_key_slow_dec(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
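
The static key is reference-counted: each sk_set_memalloc() increments it and each sk_clear_memalloc() decrements it, so sk_memalloc_socks() stays true exactly while at least one SOCK_MEMALLOC socket exists. A hedged lifetime sketch; the function name is illustrative and error handling is elided.

/* Sketch: lifetime of a reclaim-critical socket under this commit. */
static void reclaim_socket_lifetime(struct sock *sk)
{
	sk_set_memalloc(sk);	/* static_key_slow_inc(&memalloc_socks):
				 * RX paths may now use __GFP_MEMALLOC */

	/* ... socket carries swap/writeback traffic under pressure ... */

	sk_clear_memalloc(sk);	/* static_key_slow_dec(&memalloc_socks):
				 * the key stays enabled until the last
				 * memalloc socket clears it */
}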
