@@ -54,6 +54,35 @@ static struct kmem_cache *rds_tcp_conn_slab;
54
54
55
55
/* 128 KiB. Presumably a default socket buffer size for RDS-TCP; its users
 * are not visible in this excerpt - TODO confirm.
 */
#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
56
56
57
+ static int rds_tcp_skbuf_handler (struct ctl_table * ctl , int write ,
58
+ void __user * buffer , size_t * lenp ,
59
+ loff_t * fpos );
60
+
61
+ int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF ;
62
+ int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF ;
63
+
64
+ static struct ctl_table rds_tcp_sysctl_table [] = {
65
+ #define RDS_TCP_SNDBUF 0
66
+ {
67
+ .procname = "rds_tcp_sndbuf" ,
68
+ /* data is per-net pointer */
69
+ .maxlen = sizeof (int ),
70
+ .mode = 0644 ,
71
+ .proc_handler = rds_tcp_skbuf_handler ,
72
+ .extra1 = & rds_tcp_min_sndbuf ,
73
+ },
74
+ #define RDS_TCP_RCVBUF 1
75
+ {
76
+ .procname = "rds_tcp_rcvbuf" ,
77
+ /* data is per-net pointer */
78
+ .maxlen = sizeof (int ),
79
+ .mode = 0644 ,
80
+ .proc_handler = rds_tcp_skbuf_handler ,
81
+ .extra1 = & rds_tcp_min_rcvbuf ,
82
+ },
83
+ { }
84
+ };
85
+
57
86
/* doing it this way avoids calling tcp_sk() */
58
87
void rds_tcp_nonagle (struct socket * sock )
59
88
{
@@ -66,15 +95,6 @@ void rds_tcp_nonagle(struct socket *sock)
66
95
set_fs (oldfs );
67
96
}
68
97
69
- /* All module specific customizations to the RDS-TCP socket should be done in
70
- * rds_tcp_tune() and applied after socket creation. In general these
71
- * customizations should be tunable via module_param()
72
- */
73
- void rds_tcp_tune (struct socket * sock )
74
- {
75
- rds_tcp_nonagle (sock );
76
- }
77
-
78
98
u32 rds_tcp_snd_nxt (struct rds_tcp_connection * tc )
79
99
{
80
100
return tcp_sk (tc -> t_sock -> sk )-> snd_nxt ;
@@ -272,8 +292,34 @@ static int rds_tcp_netid;
272
292
struct rds_tcp_net {
273
293
struct socket * rds_tcp_listen_sock ;
274
294
struct work_struct rds_tcp_accept_w ;
295
+ struct ctl_table_header * rds_tcp_sysctl ;
296
+ struct ctl_table * ctl_table ;
297
+ int sndbuf_size ;
298
+ int rcvbuf_size ;
275
299
};
276
300
301
+ /* All module specific customizations to the RDS-TCP socket should be done in
302
+ * rds_tcp_tune() and applied after socket creation.
303
+ */
304
+ void rds_tcp_tune (struct socket * sock )
305
+ {
306
+ struct sock * sk = sock -> sk ;
307
+ struct net * net = sock_net (sk );
308
+ struct rds_tcp_net * rtn = net_generic (net , rds_tcp_netid );
309
+
310
+ rds_tcp_nonagle (sock );
311
+ lock_sock (sk );
312
+ if (rtn -> sndbuf_size > 0 ) {
313
+ sk -> sk_sndbuf = rtn -> sndbuf_size ;
314
+ sk -> sk_userlocks |= SOCK_SNDBUF_LOCK ;
315
+ }
316
+ if (rtn -> rcvbuf_size > 0 ) {
317
+ sk -> sk_sndbuf = rtn -> rcvbuf_size ;
318
+ sk -> sk_userlocks |= SOCK_RCVBUF_LOCK ;
319
+ }
320
+ release_sock (sk );
321
+ }
322
+
277
323
static void rds_tcp_accept_worker (struct work_struct * work )
278
324
{
279
325
struct rds_tcp_net * rtn = container_of (work ,
@@ -295,20 +341,60 @@ void rds_tcp_accept_work(struct sock *sk)
295
341
static __net_init int rds_tcp_init_net (struct net * net )
296
342
{
297
343
struct rds_tcp_net * rtn = net_generic (net , rds_tcp_netid );
344
+ struct ctl_table * tbl ;
345
+ int err = 0 ;
298
346
347
+ memset (rtn , 0 , sizeof (* rtn ));
348
+
349
+ /* {snd, rcv}buf_size default to 0, which implies we let the
350
+ * stack pick the value, and permit auto-tuning of buffer size.
351
+ */
352
+ if (net == & init_net ) {
353
+ tbl = rds_tcp_sysctl_table ;
354
+ } else {
355
+ tbl = kmemdup (rds_tcp_sysctl_table ,
356
+ sizeof (rds_tcp_sysctl_table ), GFP_KERNEL );
357
+ if (!tbl ) {
358
+ pr_warn ("could not set allocate syctl table\n" );
359
+ return - ENOMEM ;
360
+ }
361
+ rtn -> ctl_table = tbl ;
362
+ }
363
+ tbl [RDS_TCP_SNDBUF ].data = & rtn -> sndbuf_size ;
364
+ tbl [RDS_TCP_RCVBUF ].data = & rtn -> rcvbuf_size ;
365
+ rtn -> rds_tcp_sysctl = register_net_sysctl (net , "net/rds/tcp" , tbl );
366
+ if (!rtn -> rds_tcp_sysctl ) {
367
+ pr_warn ("could not register sysctl\n" );
368
+ err = - ENOMEM ;
369
+ goto fail ;
370
+ }
299
371
rtn -> rds_tcp_listen_sock = rds_tcp_listen_init (net );
300
372
if (!rtn -> rds_tcp_listen_sock ) {
301
373
pr_warn ("could not set up listen sock\n" );
302
- return - EAFNOSUPPORT ;
374
+ unregister_net_sysctl_table (rtn -> rds_tcp_sysctl );
375
+ rtn -> rds_tcp_sysctl = NULL ;
376
+ err = - EAFNOSUPPORT ;
377
+ goto fail ;
303
378
}
304
379
INIT_WORK (& rtn -> rds_tcp_accept_w , rds_tcp_accept_worker );
305
380
return 0 ;
381
+
382
+ fail :
383
+ if (net != & init_net )
384
+ kfree (tbl );
385
+ return err ;
306
386
}
307
387
308
388
static void __net_exit rds_tcp_exit_net (struct net * net )
309
389
{
310
390
struct rds_tcp_net * rtn = net_generic (net , rds_tcp_netid );
311
391
392
+ if (rtn -> rds_tcp_sysctl )
393
+ unregister_net_sysctl_table (rtn -> rds_tcp_sysctl );
394
+
395
+ if (net != & init_net && rtn -> ctl_table )
396
+ kfree (rtn -> ctl_table );
397
+
312
398
/* If rds_tcp_exit_net() is called as a result of netns deletion,
313
399
* the rds_tcp_kill_sock() device notifier would already have cleaned
314
400
* up the listen socket, thus there is no work to do in this function.
@@ -383,6 +469,45 @@ static struct notifier_block rds_tcp_dev_notifier = {
383
469
.priority = -10 , /* must be called after other network notifiers */
384
470
};
385
471
472
+ /* when sysctl is used to modify some kernel socket parameters,this
473
+ * function resets the RDS connections in that netns so that we can
474
+ * restart with new parameters. The assumption is that such reset
475
+ * events are few and far-between.
476
+ */
477
+ static void rds_tcp_sysctl_reset (struct net * net )
478
+ {
479
+ struct rds_tcp_connection * tc , * _tc ;
480
+
481
+ spin_lock_irq (& rds_tcp_conn_lock );
482
+ list_for_each_entry_safe (tc , _tc , & rds_tcp_conn_list , t_tcp_node ) {
483
+ struct net * c_net = read_pnet (& tc -> conn -> c_net );
484
+
485
+ if (net != c_net || !tc -> t_sock )
486
+ continue ;
487
+
488
+ rds_conn_drop (tc -> conn ); /* reconnect with new parameters */
489
+ }
490
+ spin_unlock_irq (& rds_tcp_conn_lock );
491
+ }
492
+
493
+ static int rds_tcp_skbuf_handler (struct ctl_table * ctl , int write ,
494
+ void __user * buffer , size_t * lenp ,
495
+ loff_t * fpos )
496
+ {
497
+ struct net * net = current -> nsproxy -> net_ns ;
498
+ int err ;
499
+
500
+ err = proc_dointvec_minmax (ctl , write , buffer , lenp , fpos );
501
+ if (err < 0 ) {
502
+ pr_warn ("Invalid input. Must be >= %d\n" ,
503
+ * (int * )(ctl -> extra1 ));
504
+ return err ;
505
+ }
506
+ if (write )
507
+ rds_tcp_sysctl_reset (net );
508
+ return 0 ;
509
+ }
510
+
386
511
static void rds_tcp_exit (void )
387
512
{
388
513
rds_info_deregister_func (RDS_INFO_TCP_SOCKETS , rds_tcp_tc_info );
0 commit comments