Skip to content

Commit db2db50

Browse files
robnbehlendorf
authored andcommitted
spa: make read/write queues configurable
We are finding that as customers get larger and faster machines (hundreds of cores, large NVMe-backed pools) they keep hitting relatively low performance ceilings. Our profiling work almost always finds that they're running into bottlenecks on the SPA IO taskqs. Unfortunately there's often little we can advise at that point, because there are very few ways to change behaviour without patching. This commit adds two load-time parameters `zio_taskq_read` and `zio_taskq_write` that can configure the READ and WRITE IO taskqs directly. This achieves two goals: it gives operators (and those that support them) a way to tune things without requiring a custom build of OpenZFS, which is often not possible, and it lets us easily try different config variations in a variety of environments to inform the development of better defaults for these kinds of systems. Because tuning the IO taskqs really requires a fairly deep understanding of how IO in ZFS works, and generally isn't needed without a pretty serious workload and an ability to identify bottlenecks, only minimal documentation is provided. It's expected that anyone using this is going to have the source code there as well. Signed-off-by: Rob Norris <[email protected]> Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
1 parent d530d5d commit db2db50

File tree

2 files changed

+306
-1
lines changed

2 files changed

+306
-1
lines changed

man/man4/zfs.4

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2280,6 +2280,16 @@ If
22802280
.Sy 0 ,
22812281
generate a system-dependent value close to 6 threads per taskq.
22822282
.
2283+
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
2284+
Set the queue and thread configuration for the IO read queues.
2285+
This is an advanced debugging parameter.
2286+
Don't change this unless you understand what it does.
2287+
.
2288+
.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
2289+
Set the queue and thread configuration for the IO write queues.
2290+
This is an advanced debugging parameter.
2291+
Don't change this unless you understand what it does.
2292+
.
22832293
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
22842294
Do not create zvol device nodes.
22852295
This may slightly improve startup time on

module/zfs/spa.c

Lines changed: 296 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
151151
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
152152
* need to be handled with minimum delay.
153153
*/
154-
static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
154+
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
155155
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
156156
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
157157
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
@@ -1164,6 +1164,292 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
11641164
tqs->stqs_taskq = NULL;
11651165
}
11661166

1167+
#ifdef _KERNEL
1168+
/*
1169+
* The READ and WRITE rows of zio_taskqs are configurable at module load time
1170+
* by setting zio_taskq_read or zio_taskq_write.
1171+
*
1172+
* Example (the defaults for READ and WRITE)
1173+
* zio_taskq_read='fixed,1,8 null scale null'
1174+
* zio_taskq_write='batch fixed,1,5 scale fixed,1,5'
1175+
*
1176+
* Each sets the entire row at a time.
1177+
*
1178+
* 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
1179+
* of threads per taskq.
1180+
*
1181+
* 'null' can only be set on the high-priority queues (queue selection for
1182+
* high-priority queues will fall back to the regular queue if the high-pri
1183+
* is NULL).
1184+
*/
1185+
/*
 * Mode names as they appear in the parameter strings. The array is indexed
 * by zti_mode value, both when matching input and when printing the live
 * configuration, so the order here must follow the mode enumeration.
 */
static const char *const modes[ZTI_NMODES] = {
1186+
"fixed", "batch", "scale", "null"
1187+
};
1188+
1189+
/* Parse the incoming config string. Modifies cfg */
1190+
static int
1191+
spa_taskq_param_set(zio_type_t t, char *cfg)
1192+
{
1193+
int err = 0;
1194+
1195+
zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
1196+
1197+
char *next = cfg, *tok, *c;
1198+
1199+
/*
1200+
* Parse out each element from the string and fill `row`. The entire
1201+
* row has to be set at once, so any errors are flagged by just
1202+
* breaking out of this loop early.
1203+
*/
1204+
uint_t q;
1205+
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1206+
/* `next` is the start of the config */
1207+
if (next == NULL)
1208+
break;
1209+
1210+
/* Eat up leading space */
1211+
while (isspace(*next))
1212+
next++;
1213+
if (*next == '\0')
1214+
break;
1215+
1216+
/* Mode ends at space or end of string */
1217+
tok = next;
1218+
next = strchr(tok, ' ');
1219+
if (next != NULL) *next++ = '\0';
1220+
1221+
/* Parameters start after a comma */
1222+
c = strchr(tok, ',');
1223+
if (c != NULL) *c++ = '\0';
1224+
1225+
/* Match mode string */
1226+
uint_t mode;
1227+
for (mode = 0; mode < ZTI_NMODES; mode++)
1228+
if (strcmp(tok, modes[mode]) == 0)
1229+
break;
1230+
if (mode == ZTI_NMODES)
1231+
break;
1232+
1233+
/* Invalid canary */
1234+
row[q].zti_mode = ZTI_NMODES;
1235+
1236+
/* Per-mode setup */
1237+
switch (mode) {
1238+
1239+
/*
1240+
* FIXED is parameterised: number of queues, and number of
1241+
* threads per queue.
1242+
*/
1243+
case ZTI_MODE_FIXED: {
1244+
/* No parameters? */
1245+
if (c == NULL || *c == '\0')
1246+
break;
1247+
1248+
/* Find next parameter */
1249+
tok = c;
1250+
c = strchr(tok, ',');
1251+
if (c == NULL)
1252+
break;
1253+
1254+
/* Take digits and convert */
1255+
unsigned long long nq;
1256+
if (!(isdigit(*tok)))
1257+
break;
1258+
err = ddi_strtoull(tok, &tok, 10, &nq);
1259+
/* Must succeed and also end at the next param sep */
1260+
if (err != 0 || tok != c)
1261+
break;
1262+
1263+
/* Move past the comma */
1264+
tok++;
1265+
/* Need another number */
1266+
if (!(isdigit(*tok)))
1267+
break;
1268+
/* Remember start to make sure we moved */
1269+
c = tok;
1270+
1271+
/* Take digits */
1272+
unsigned long long ntpq;
1273+
err = ddi_strtoull(tok, &tok, 10, &ntpq);
1274+
/* Must succeed, and moved forward */
1275+
if (err != 0 || tok == c || *tok != '\0')
1276+
break;
1277+
1278+
/*
1279+
* sanity; zero queues/threads make no sense, and
1280+
* 16K is almost certainly more than anyone will ever
1281+
* need and avoids silly numbers like UINT32_MAX
1282+
*/
1283+
if (nq == 0 || nq >= 16384 ||
1284+
ntpq == 0 || ntpq >= 16384)
1285+
break;
1286+
1287+
const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
1288+
row[q] = zti;
1289+
break;
1290+
}
1291+
1292+
case ZTI_MODE_BATCH: {
1293+
const zio_taskq_info_t zti = ZTI_BATCH;
1294+
row[q] = zti;
1295+
break;
1296+
}
1297+
1298+
case ZTI_MODE_SCALE: {
1299+
const zio_taskq_info_t zti = ZTI_SCALE;
1300+
row[q] = zti;
1301+
break;
1302+
}
1303+
1304+
case ZTI_MODE_NULL: {
1305+
/*
1306+
* Can only null the high-priority queues; the general-
1307+
* purpose ones have to exist.
1308+
*/
1309+
if (q != ZIO_TASKQ_ISSUE_HIGH &&
1310+
q != ZIO_TASKQ_INTERRUPT_HIGH)
1311+
break;
1312+
1313+
const zio_taskq_info_t zti = ZTI_NULL;
1314+
row[q] = zti;
1315+
break;
1316+
}
1317+
1318+
default:
1319+
break;
1320+
}
1321+
1322+
/* Ensure we set a mode */
1323+
if (row[q].zti_mode == ZTI_NMODES)
1324+
break;
1325+
}
1326+
1327+
/* Didn't get a full row, fail */
1328+
if (q < ZIO_TASKQ_TYPES)
1329+
return (SET_ERROR(EINVAL));
1330+
1331+
/* Eat trailing space */
1332+
if (next != NULL)
1333+
while (isspace(*next))
1334+
next++;
1335+
1336+
/* If there's anything left over then fail */
1337+
if (next != NULL && *next != '\0')
1338+
return (SET_ERROR(EINVAL));
1339+
1340+
/* Success! Copy it into the real config */
1341+
for (q = 0; q < ZIO_TASKQ_TYPES; q++)
1342+
zio_taskqs[t][q] = row[q];
1343+
1344+
return (0);
1345+
}
1346+
1347+
static int
1348+
spa_taskq_param_get(zio_type_t t, char *buf)
1349+
{
1350+
int pos = 0;
1351+
1352+
/* Build paramater string from live config */
1353+
const char *sep = "";
1354+
for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
1355+
const zio_taskq_info_t *zti = &zio_taskqs[t][q];
1356+
if (zti->zti_mode == ZTI_MODE_FIXED)
1357+
pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
1358+
modes[zti->zti_mode], zti->zti_count,
1359+
zti->zti_value);
1360+
else
1361+
pos += sprintf(&buf[pos], "%s%s", sep,
1362+
modes[zti->zti_mode]);
1363+
sep = " ";
1364+
}
1365+
1366+
buf[pos++] = '\n';
1367+
buf[pos] = '\0';
1368+
1369+
return (pos);
1370+
}
1371+
1372+
#ifdef __linux__
1373+
static int
1374+
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
1375+
{
1376+
char *cfg = kmem_strdup(val);
1377+
int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
1378+
kmem_free(cfg, strlen(val)+1);
1379+
return (-err);
1380+
}
1381+
static int
1382+
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
1383+
{
1384+
return (spa_taskq_param_get(ZIO_TYPE_READ, buf));
1385+
}
1386+
1387+
static int
1388+
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
1389+
{
1390+
char *cfg = kmem_strdup(val);
1391+
int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
1392+
kmem_free(cfg, strlen(val)+1);
1393+
return (-err);
1394+
}
1395+
static int
1396+
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
1397+
{
1398+
return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf));
1399+
}
1400+
#else
1401+
#include <sys/sbuf.h>
1402+
1403+
/*
1404+
* On FreeBSD load-time parameters can be set up before malloc() is available,
1405+
* so we have to do all the parsing work on the stack.
1406+
*/
1407+
#define SPA_TASKQ_PARAM_MAX (128)
1408+
1409+
static int
1410+
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
1411+
{
1412+
char buf[SPA_TASKQ_PARAM_MAX];
1413+
int err = 0;
1414+
1415+
if (req->newptr == NULL) {
1416+
int len = spa_taskq_param_get(ZIO_TYPE_READ, buf);
1417+
struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req);
1418+
sbuf_cpy(s, buf);
1419+
err = sbuf_finish(s);
1420+
sbuf_delete(s);
1421+
return (err);
1422+
}
1423+
1424+
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
1425+
if (err)
1426+
return (err);
1427+
return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
1428+
}
1429+
1430+
static int
1431+
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
1432+
{
1433+
char buf[SPA_TASKQ_PARAM_MAX];
1434+
int err = 0;
1435+
1436+
if (req->newptr == NULL) {
1437+
int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf);
1438+
struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req);
1439+
sbuf_cpy(s, buf);
1440+
err = sbuf_finish(s);
1441+
sbuf_delete(s);
1442+
return (err);
1443+
}
1444+
1445+
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
1446+
if (err)
1447+
return (err);
1448+
return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
1449+
}
1450+
#endif
1451+
#endif /* _KERNEL */
1452+
11671453
/*
11681454
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
11691455
* Note that a type may have multiple discrete taskqs to avoid lock contention
@@ -10210,4 +10496,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
1021010496
ZMOD_RW,
1021110497
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
1021210498
"was being condensed");
10499+
10500+
#ifdef _KERNEL
10501+
/*
 * Register the taskq_read and taskq_write parameters (name prefix zio_).
 * They are backed by the spa_taskq_*_param_set/get handlers rather than by
 * a plain variable, and are registered with ZMOD_RD.
 */
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
10502+
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
10503+
"Configure IO queues for read IO");
10504+
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
10505+
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
10506+
"Configure IO queues for write IO");
10507+
#endif
1021310508
/* END CSTYLED */

0 commit comments

Comments
 (0)