@@ -52,7 +52,34 @@ static LIST_HEAD(rds_tcp_conn_list);
52
52
53
53
static struct kmem_cache * rds_tcp_conn_slab ;
54
54
55
- #define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
55
+ static int rds_tcp_skbuf_handler (struct ctl_table * ctl , int write ,
56
+ void __user * buffer , size_t * lenp ,
57
+ loff_t * fpos );
58
+
59
+ int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF ;
60
+ int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF ;
61
+
62
+ static struct ctl_table rds_tcp_sysctl_table [] = {
63
+ #define RDS_TCP_SNDBUF 0
64
+ {
65
+ .procname = "rds_tcp_sndbuf" ,
66
+ /* data is per-net pointer */
67
+ .maxlen = sizeof (int ),
68
+ .mode = 0644 ,
69
+ .proc_handler = rds_tcp_skbuf_handler ,
70
+ .extra1 = & rds_tcp_min_sndbuf ,
71
+ },
72
+ #define RDS_TCP_RCVBUF 1
73
+ {
74
+ .procname = "rds_tcp_rcvbuf" ,
75
+ /* data is per-net pointer */
76
+ .maxlen = sizeof (int ),
77
+ .mode = 0644 ,
78
+ .proc_handler = rds_tcp_skbuf_handler ,
79
+ .extra1 = & rds_tcp_min_rcvbuf ,
80
+ },
81
+ { }
82
+ };
56
83
57
84
/* doing it this way avoids calling tcp_sk() */
58
85
void rds_tcp_nonagle (struct socket * sock )
@@ -66,15 +93,6 @@ void rds_tcp_nonagle(struct socket *sock)
66
93
set_fs (oldfs );
67
94
}
68
95
69
- /* All module specific customizations to the RDS-TCP socket should be done in
70
- * rds_tcp_tune() and applied after socket creation. In general these
71
- * customizations should be tunable via module_param()
72
- */
73
- void rds_tcp_tune (struct socket * sock )
74
- {
75
- rds_tcp_nonagle (sock );
76
- }
77
-
78
96
u32 rds_tcp_snd_nxt (struct rds_tcp_connection * tc )
79
97
{
80
98
return tcp_sk (tc -> t_sock -> sk )-> snd_nxt ;
@@ -272,8 +290,34 @@ static int rds_tcp_netid;
272
290
struct rds_tcp_net {
273
291
struct socket * rds_tcp_listen_sock ;
274
292
struct work_struct rds_tcp_accept_w ;
293
+ struct ctl_table_header * rds_tcp_sysctl ;
294
+ struct ctl_table * ctl_table ;
295
+ int sndbuf_size ;
296
+ int rcvbuf_size ;
275
297
};
276
298
299
+ /* All module specific customizations to the RDS-TCP socket should be done in
300
+ * rds_tcp_tune() and applied after socket creation.
301
+ */
302
+ void rds_tcp_tune (struct socket * sock )
303
+ {
304
+ struct sock * sk = sock -> sk ;
305
+ struct net * net = sock_net (sk );
306
+ struct rds_tcp_net * rtn = net_generic (net , rds_tcp_netid );
307
+
308
+ rds_tcp_nonagle (sock );
309
+ lock_sock (sk );
310
+ if (rtn -> sndbuf_size > 0 ) {
311
+ sk -> sk_sndbuf = rtn -> sndbuf_size ;
312
+ sk -> sk_userlocks |= SOCK_SNDBUF_LOCK ;
313
+ }
314
+ if (rtn -> rcvbuf_size > 0 ) {
315
+ sk -> sk_sndbuf = rtn -> rcvbuf_size ;
316
+ sk -> sk_userlocks |= SOCK_RCVBUF_LOCK ;
317
+ }
318
+ release_sock (sk );
319
+ }
320
+
277
321
static void rds_tcp_accept_worker (struct work_struct * work )
278
322
{
279
323
struct rds_tcp_net * rtn = container_of (work ,
@@ -295,20 +339,60 @@ void rds_tcp_accept_work(struct sock *sk)
295
339
static __net_init int rds_tcp_init_net (struct net * net )
296
340
{
297
341
struct rds_tcp_net * rtn = net_generic (net , rds_tcp_netid );
342
+ struct ctl_table * tbl ;
343
+ int err = 0 ;
344
+
345
+ memset (rtn , 0 , sizeof (* rtn ));
298
346
347
+ /* {snd, rcv}buf_size default to 0, which implies we let the
348
+ * stack pick the value, and permit auto-tuning of buffer size.
349
+ */
350
+ if (net == & init_net ) {
351
+ tbl = rds_tcp_sysctl_table ;
352
+ } else {
353
+ tbl = kmemdup (rds_tcp_sysctl_table ,
354
+ sizeof (rds_tcp_sysctl_table ), GFP_KERNEL );
355
+ if (!tbl ) {
356
+ pr_warn ("could not set allocate syctl table\n" );
357
+ return - ENOMEM ;
358
+ }
359
+ rtn -> ctl_table = tbl ;
360
+ }
361
+ tbl [RDS_TCP_SNDBUF ].data = & rtn -> sndbuf_size ;
362
+ tbl [RDS_TCP_RCVBUF ].data = & rtn -> rcvbuf_size ;
363
+ rtn -> rds_tcp_sysctl = register_net_sysctl (net , "net/rds/tcp" , tbl );
364
+ if (!rtn -> rds_tcp_sysctl ) {
365
+ pr_warn ("could not register sysctl\n" );
366
+ err = - ENOMEM ;
367
+ goto fail ;
368
+ }
299
369
rtn -> rds_tcp_listen_sock = rds_tcp_listen_init (net );
300
370
if (!rtn -> rds_tcp_listen_sock ) {
301
371
pr_warn ("could not set up listen sock\n" );
302
- return - EAFNOSUPPORT ;
372
+ unregister_net_sysctl_table (rtn -> rds_tcp_sysctl );
373
+ rtn -> rds_tcp_sysctl = NULL ;
374
+ err = - EAFNOSUPPORT ;
375
+ goto fail ;
303
376
}
304
377
INIT_WORK (& rtn -> rds_tcp_accept_w , rds_tcp_accept_worker );
305
378
return 0 ;
379
+
380
+ fail :
381
+ if (net != & init_net )
382
+ kfree (tbl );
383
+ return err ;
306
384
}
307
385
308
386
static void __net_exit rds_tcp_exit_net (struct net * net )
309
387
{
310
388
struct rds_tcp_net * rtn = net_generic (net , rds_tcp_netid );
311
389
390
+ if (rtn -> rds_tcp_sysctl )
391
+ unregister_net_sysctl_table (rtn -> rds_tcp_sysctl );
392
+
393
+ if (net != & init_net && rtn -> ctl_table )
394
+ kfree (rtn -> ctl_table );
395
+
312
396
/* If rds_tcp_exit_net() is called as a result of netns deletion,
313
397
* the rds_tcp_kill_sock() device notifier would already have cleaned
314
398
* up the listen socket, thus there is no work to do in this function.
@@ -383,6 +467,45 @@ static struct notifier_block rds_tcp_dev_notifier = {
383
467
.priority = -10 , /* must be called after other network notifiers */
384
468
};
385
469
470
+ /* when sysctl is used to modify some kernel socket parameters,this
471
+ * function resets the RDS connections in that netns so that we can
472
+ * restart with new parameters. The assumption is that such reset
473
+ * events are few and far-between.
474
+ */
475
+ static void rds_tcp_sysctl_reset (struct net * net )
476
+ {
477
+ struct rds_tcp_connection * tc , * _tc ;
478
+
479
+ spin_lock_irq (& rds_tcp_conn_lock );
480
+ list_for_each_entry_safe (tc , _tc , & rds_tcp_conn_list , t_tcp_node ) {
481
+ struct net * c_net = read_pnet (& tc -> conn -> c_net );
482
+
483
+ if (net != c_net || !tc -> t_sock )
484
+ continue ;
485
+
486
+ rds_conn_drop (tc -> conn ); /* reconnect with new parameters */
487
+ }
488
+ spin_unlock_irq (& rds_tcp_conn_lock );
489
+ }
490
+
491
+ static int rds_tcp_skbuf_handler (struct ctl_table * ctl , int write ,
492
+ void __user * buffer , size_t * lenp ,
493
+ loff_t * fpos )
494
+ {
495
+ struct net * net = current -> nsproxy -> net_ns ;
496
+ int err ;
497
+
498
+ err = proc_dointvec_minmax (ctl , write , buffer , lenp , fpos );
499
+ if (err < 0 ) {
500
+ pr_warn ("Invalid input. Must be >= %d\n" ,
501
+ * (int * )(ctl -> extra1 ));
502
+ return err ;
503
+ }
504
+ if (write )
505
+ rds_tcp_sysctl_reset (net );
506
+ return 0 ;
507
+ }
508
+
386
509
static void rds_tcp_exit (void )
387
510
{
388
511
rds_info_deregister_func (RDS_INFO_TCP_SOCKETS , rds_tcp_tc_info );
0 commit comments