[go: nahoru, domu]

o2iblnd_modparams.c revision a7f24447d1362353e36f2b1effca538912fbcda8
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lnet/klnds/o2iblnd/o2iblnd_modparams.c
37 *
38 * Author: Eric Barton <eric@bartonsoftware.com>
39 */
40
41#include "o2iblnd.h"
42
43static int service = 987;
44CFS_MODULE_PARM(service, "i", int, 0444,
45		"service number (within RDMA_PS_TCP)");
46
47static int cksum = 0;
48CFS_MODULE_PARM(cksum, "i", int, 0644,
49		"set non-zero to enable message (not RDMA) checksums");
50
51static int timeout = 50;
52CFS_MODULE_PARM(timeout, "i", int, 0644,
53		"timeout (seconds)");
54
55/* Number of threads in each scheduler pool which is percpt,
56 * we will estimate reasonable value based on CPUs if it's set to zero. */
57static int nscheds;
58CFS_MODULE_PARM(nscheds, "i", int, 0444,
59		"number of threads in each scheduler pool");
60
61/* NB: this value is shared by all CPTs, it can grow at runtime */
62static int ntx = 512;
63CFS_MODULE_PARM(ntx, "i", int, 0444,
64		"# of message descriptors allocated for each pool");
65
66/* NB: this value is shared by all CPTs */
67static int credits = 256;
68CFS_MODULE_PARM(credits, "i", int, 0444,
69		"# concurrent sends");
70
71static int peer_credits = 8;
72CFS_MODULE_PARM(peer_credits, "i", int, 0444,
73		"# concurrent sends to 1 peer");
74
75static int peer_credits_hiw = 0;
76CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444,
77		"when eagerly to return credits");
78
79static int peer_buffer_credits = 0;
80CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
81		"# per-peer router buffer credits");
82
83static int peer_timeout = 180;
84CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
85		"Seconds without aliveness news to declare peer dead (<=0 to disable)");
86
87static char *ipif_name = "ib0";
88CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
89		"IPoIB interface name");
90
91static int retry_count = 5;
92CFS_MODULE_PARM(retry_count, "i", int, 0644,
93		"Retransmissions when no ACK received");
94
95static int rnr_retry_count = 6;
96CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
97		"RNR retransmissions");
98
99static int keepalive = 100;
100CFS_MODULE_PARM(keepalive, "i", int, 0644,
101		"Idle time in seconds before sending a keepalive");
102
103static int ib_mtu = 0;
104CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
105		"IB MTU 256/512/1024/2048/4096");
106
107static int concurrent_sends = 0;
108CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
109		"send work-queue sizing");
110
111static int map_on_demand = 0;
112CFS_MODULE_PARM(map_on_demand, "i", int, 0444,
113		"map on demand");
114
115/* NB: this value is shared by all CPTs, it can grow at runtime */
116static int fmr_pool_size = 512;
117CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
118		"size of fmr pool on each CPT (>= ntx / 4)");
119
120/* NB: this value is shared by all CPTs, it can grow at runtime */
121static int fmr_flush_trigger = 384;
122CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
123		"# dirty FMRs that triggers pool flush");
124
125static int fmr_cache = 1;
126CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
127		"non-zero to enable FMR caching");
128
129/* NB: this value is shared by all CPTs, it can grow at runtime */
130static int pmr_pool_size = 512;
131CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444,
132		"size of MR cache pmr pool on each CPT");
133
134/*
135 * 0: disable failover
136 * 1: enable failover if necessary
137 * 2: force to failover (for debug)
138 */
139static int dev_failover = 0;
140CFS_MODULE_PARM(dev_failover, "i", int, 0444,
141	       "HCA failover for bonding (0 off, 1 on, other values reserved)");
142
143
144static int require_privileged_port = 0;
145CFS_MODULE_PARM(require_privileged_port, "i", int, 0644,
146		"require privileged port when accepting connection");
147
148static int use_privileged_port = 1;
149CFS_MODULE_PARM(use_privileged_port, "i", int, 0644,
150		"use privileged port when initiating connection");
151
152kib_tunables_t kiblnd_tunables = {
153	.kib_dev_failover	   = &dev_failover,
154	.kib_service		= &service,
155	.kib_cksum		  = &cksum,
156	.kib_timeout		= &timeout,
157	.kib_keepalive	      = &keepalive,
158	.kib_ntx		    = &ntx,
159	.kib_credits		= &credits,
160	.kib_peertxcredits	  = &peer_credits,
161	.kib_peercredits_hiw	= &peer_credits_hiw,
162	.kib_peerrtrcredits	 = &peer_buffer_credits,
163	.kib_peertimeout	    = &peer_timeout,
164	.kib_default_ipif	   = &ipif_name,
165	.kib_retry_count	    = &retry_count,
166	.kib_rnr_retry_count	= &rnr_retry_count,
167	.kib_concurrent_sends       = &concurrent_sends,
168	.kib_ib_mtu		 = &ib_mtu,
169	.kib_map_on_demand	  = &map_on_demand,
170	.kib_fmr_pool_size	  = &fmr_pool_size,
171	.kib_fmr_flush_trigger      = &fmr_flush_trigger,
172	.kib_fmr_cache	      = &fmr_cache,
173	.kib_pmr_pool_size	  = &pmr_pool_size,
174	.kib_require_priv_port      = &require_privileged_port,
175	.kib_use_priv_port	    = &use_privileged_port,
176	.kib_nscheds		    = &nscheds
177};
178
179#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
180
/* Backing buffer for the "ipif_name" sysctl entry; kiblnd_sysctl_init()
 * fills it from the ipif_name module parameter. */
static char ipif_basename_space[32];

/*
 * Identifiers used as .ctl_name in kiblnd_ctl_table.  They are numbered
 * consecutively starting at 1; the values are spelled out so that the
 * numbering is visible at a glance.
 */
enum {
	O2IBLND_SERVICE			= 1,
	O2IBLND_CKSUM			= 2,
	O2IBLND_TIMEOUT			= 3,
	O2IBLND_NTX			= 4,
	O2IBLND_CREDITS			= 5,
	O2IBLND_PEER_TXCREDITS		= 6,
	O2IBLND_PEER_CREDITS_HIW	= 7,
	O2IBLND_PEER_RTRCREDITS		= 8,
	O2IBLND_PEER_TIMEOUT		= 9,
	O2IBLND_IPIF_BASENAME		= 10,
	O2IBLND_RETRY_COUNT		= 11,
	O2IBLND_RNR_RETRY_COUNT		= 12,
	O2IBLND_KEEPALIVE		= 13,
	O2IBLND_CONCURRENT_SENDS	= 14,
	O2IBLND_IB_MTU			= 15,
	O2IBLND_MAP_ON_DEMAND		= 16,
	O2IBLND_FMR_POOL_SIZE		= 17,
	O2IBLND_FMR_FLUSH_TRIGGER	= 18,
	O2IBLND_FMR_CACHE		= 19,
	O2IBLND_PMR_POOL_SIZE		= 20,
	O2IBLND_DEV_FAILOVER		= 21
};
207
208static ctl_table_t kiblnd_ctl_table[] = {
209	{
210		.ctl_name = O2IBLND_SERVICE,
211		.procname = "service",
212		.data     = &service,
213		.maxlen   = sizeof(int),
214		.mode     = 0444,
215		.proc_handler = &proc_dointvec
216	},
217	{
218		.ctl_name = O2IBLND_CKSUM,
219		.procname = "cksum",
220		.data     = &cksum,
221		.maxlen   = sizeof(int),
222		.mode     = 0644,
223		.proc_handler = &proc_dointvec
224	},
225	{
226		.ctl_name = O2IBLND_TIMEOUT,
227		.procname = "timeout",
228		.data     = &timeout,
229		.maxlen   = sizeof(int),
230		.mode     = 0644,
231		.proc_handler = &proc_dointvec
232	},
233	{
234		.ctl_name = O2IBLND_NTX,
235		.procname = "ntx",
236		.data     = &ntx,
237		.maxlen   = sizeof(int),
238		.mode     = 0444,
239		.proc_handler = &proc_dointvec
240	},
241	{
242		.ctl_name = O2IBLND_CREDITS,
243		.procname = "credits",
244		.data     = &credits,
245		.maxlen   = sizeof(int),
246		.mode     = 0444,
247		.proc_handler = &proc_dointvec
248	},
249	{
250		.ctl_name = O2IBLND_PEER_TXCREDITS,
251		.procname = "peer_credits",
252		.data     = &peer_credits,
253		.maxlen   = sizeof(int),
254		.mode     = 0444,
255		.proc_handler = &proc_dointvec
256	},
257	{
258		.ctl_name = O2IBLND_PEER_CREDITS_HIW,
259		.procname = "peer_credits_hiw",
260		.data     = &peer_credits_hiw,
261		.maxlen   = sizeof(int),
262		.mode     = 0444,
263		.proc_handler = &proc_dointvec
264	},
265	{
266		.ctl_name = O2IBLND_PEER_RTRCREDITS,
267		.procname = "peer_buffer_credits",
268		.data     = &peer_buffer_credits,
269		.maxlen   = sizeof(int),
270		.mode     = 0444,
271		.proc_handler = &proc_dointvec
272	},
273	{
274		.ctl_name = O2IBLND_PEER_TIMEOUT,
275		.procname = "peer_timeout",
276		.data     = &peer_timeout,
277		.maxlen   = sizeof(int),
278		.mode     = 0444,
279		.proc_handler = &proc_dointvec
280	},
281	{
282		.ctl_name = O2IBLND_IPIF_BASENAME,
283		.procname = "ipif_name",
284		.data     = ipif_basename_space,
285		.maxlen   = sizeof(ipif_basename_space),
286		.mode     = 0444,
287		.proc_handler = &proc_dostring
288	},
289	{
290		.ctl_name = O2IBLND_RETRY_COUNT,
291		.procname = "retry_count",
292		.data     = &retry_count,
293		.maxlen   = sizeof(int),
294		.mode     = 0644,
295		.proc_handler = &proc_dointvec
296	},
297	{
298		.ctl_name = O2IBLND_RNR_RETRY_COUNT,
299		.procname = "rnr_retry_count",
300		.data     = &rnr_retry_count,
301		.maxlen   = sizeof(int),
302		.mode     = 0644,
303		.proc_handler = &proc_dointvec
304	},
305	{
306		.ctl_name = O2IBLND_KEEPALIVE,
307		.procname = "keepalive",
308		.data     = &keepalive,
309		.maxlen   = sizeof(int),
310		.mode     = 0644,
311		.proc_handler = &proc_dointvec
312	},
313	{
314		.ctl_name = O2IBLND_CONCURRENT_SENDS,
315		.procname = "concurrent_sends",
316		.data     = &concurrent_sends,
317		.maxlen   = sizeof(int),
318		.mode     = 0444,
319		.proc_handler = &proc_dointvec
320	},
321	{
322		.ctl_name = O2IBLND_IB_MTU,
323		.procname = "ib_mtu",
324		.data     = &ib_mtu,
325		.maxlen   = sizeof(int),
326		.mode     = 0444,
327		.proc_handler = &proc_dointvec
328	},
329	{
330		.ctl_name = O2IBLND_MAP_ON_DEMAND,
331		.procname = "map_on_demand",
332		.data     = &map_on_demand,
333		.maxlen   = sizeof(int),
334		.mode     = 0444,
335		.proc_handler = &proc_dointvec
336	},
337
338	{
339		.ctl_name = O2IBLND_FMR_POOL_SIZE,
340		.procname = "fmr_pool_size",
341		.data     = &fmr_pool_size,
342		.maxlen   = sizeof(int),
343		.mode     = 0444,
344		.proc_handler = &proc_dointvec
345	},
346	{
347		.ctl_name = O2IBLND_FMR_FLUSH_TRIGGER,
348		.procname = "fmr_flush_trigger",
349		.data     = &fmr_flush_trigger,
350		.maxlen   = sizeof(int),
351		.mode     = 0444,
352		.proc_handler = &proc_dointvec
353	},
354	{
355		.ctl_name = O2IBLND_FMR_CACHE,
356		.procname = "fmr_cache",
357		.data     = &fmr_cache,
358		.maxlen   = sizeof(int),
359		.mode     = 0444,
360		.proc_handler = &proc_dointvec
361	},
362	{
363		.ctl_name = O2IBLND_PMR_POOL_SIZE,
364		.procname = "pmr_pool_size",
365		.data     = &pmr_pool_size,
366		.maxlen   = sizeof(int),
367		.mode     = 0444,
368		.proc_handler = &proc_dointvec
369	},
370	{
371		.ctl_name = O2IBLND_DEV_FAILOVER,
372		.procname = "dev_failover",
373		.data     = &dev_failover,
374		.maxlen   = sizeof(int),
375		.mode     = 0444,
376		.proc_handler = &proc_dointvec
377	},
378	{0}
379};
380
381static ctl_table_t kiblnd_top_ctl_table[] = {
382	{
383		.ctl_name = CTL_O2IBLND,
384		.procname = "o2iblnd",
385		.data     = NULL,
386		.maxlen   = 0,
387		.mode     = 0555,
388		.child    = kiblnd_ctl_table
389	},
390	{0}
391};
392
/*
 * Copy the string tunable 'str' into the fixed-size buffer 'space'.
 *
 * space: destination buffer of 'size' bytes
 * str:   NUL-terminated source string; NULL is treated as the empty string
 * size:  capacity of 'space' in bytes, including the terminating NUL
 *
 * The result is always NUL-terminated and truncated to size - 1 characters
 * if 'str' is longer.  Nothing is written when 'space' is NULL or 'size'
 * is not positive; the unguarded version stored the terminator at
 * space[size - 1] even for size == 0, i.e. one byte before the buffer.
 */
void
kiblnd_initstrtunable(char *space, char *str, int size)
{
	if (space == NULL || size <= 0)
		return;

	if (str == NULL) {
		space[0] = '\0';
		return;
	}

	/* strncpy zero-fills the unused tail; the last byte is set
	 * explicitly because strncpy does not terminate on truncation */
	strncpy(space, str, size - 1);
	space[size - 1] = '\0';
}
399
400void
401kiblnd_sysctl_init (void)
402{
403	kiblnd_initstrtunable(ipif_basename_space, ipif_name,
404			      sizeof(ipif_basename_space));
405
406	kiblnd_tunables.kib_sysctl =
407		register_sysctl_table(kiblnd_top_ctl_table);
408
409	if (kiblnd_tunables.kib_sysctl == NULL)
410		CWARN("Can't setup /proc tunables\n");
411}
412
413void
414kiblnd_sysctl_fini (void)
415{
416	if (kiblnd_tunables.kib_sysctl != NULL)
417		unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
418}
419
420#else
421
/* Stub for builds where the sysctl table above is compiled out. */
void
kiblnd_sysctl_init(void)
{
	/* nothing to register */
}
426
/* Stub for builds where the sysctl table above is compiled out. */
void
kiblnd_sysctl_fini(void)
{
	/* nothing to unregister */
}
431
432#endif
433
434int
435kiblnd_tunables_init (void)
436{
437	if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
438		CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
439		       *kiblnd_tunables.kib_ib_mtu);
440		return -EINVAL;
441	}
442
443	if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
444		*kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
445
446	if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
447		*kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
448
449	if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
450		*kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
451
452	if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
453		*kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
454
455	if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
456		*kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
457
458	if (*kiblnd_tunables.kib_map_on_demand < 0 ||
459	    *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
460		*kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
461
462	if (*kiblnd_tunables.kib_map_on_demand == 1)
463		*kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */
464
465	if (*kiblnd_tunables.kib_concurrent_sends == 0) {
466		if (*kiblnd_tunables.kib_map_on_demand > 0 &&
467		    *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
468			*kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
469		else
470			*kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
471	}
472
473	if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
474		*kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
475
476	if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
477		*kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
478
479	if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
480		CWARN("Concurrent sends %d is lower than message queue size: %d, "
481		      "performance may drop slightly.\n",
482		      *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
483	}
484
485	kiblnd_sysctl_init();
486	return 0;
487}
488
/*
 * Counterpart of kiblnd_tunables_init(): the sysctl interface is the only
 * thing that needs tearing down.
 */
void
kiblnd_tunables_fini(void)
{
	kiblnd_sysctl_fini();
}
494