Skip to content

Commit a0aeea0

Browse files
committed
spa: make read/write queues configurable
We are finding that as customers get larger and faster machines (hundreds of cores, large NVMe-backed pools) they keep hitting relatively low performance ceilings. Our profiling work almost always finds that they're running into bottlenecks on the SPA IO taskqs. Unfortunately there's often little we can advise at that point, because there are very few ways to change behaviour without patching. This commit adds two load-time parameters `zio_taskq_read` and `zio_taskq_write` that can configure the READ and WRITE IO taskqs directly. This achieves two goals: it gives operators (and those that support them) a way to tune things without requiring a custom build of OpenZFS, which is often not possible, and it lets us easily try different config variations in a variety of environments to inform the development of better defaults for these kinds of systems. Because tuning the IO taskqs really requires a fairly deep understanding of how IO in ZFS works, and generally isn't needed without a pretty serious workload and an ability to identify bottlenecks, only minimal documentation is provided. It's expected that anyone using this is going to have the source code there as well. Signed-off-by: Rob Norris <[email protected]> Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
1 parent 494aaae commit a0aeea0

File tree

2 files changed

+254
-1
lines changed

2 files changed

+254
-1
lines changed

man/man4/zfs.4

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2280,6 +2280,16 @@ If
22802280
.Sy 0 ,
22812281
generate a system-dependent value close to 6 threads per taskq.
22822282
.
2283+
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
2284+
Set the queue and thread configuration for the IO read queues.
2285+
This is an advanced debugging parameter.
2286+
Don't change this unless you understand what it does.
2287+
.
2288+
.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
2289+
Set the queue and thread configuration for the IO write queues.
2290+
This is an advanced debugging parameter.
2291+
Don't change this unless you understand what it does.
2292+
.
22832293
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
22842294
Do not create zvol device nodes.
22852295
This may slightly improve startup time on

module/zfs/spa.c

Lines changed: 244 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
151151
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
152152
* need to be handled with minimum delay.
153153
*/
154-
static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
154+
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
155155
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
156156
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
157157
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
@@ -1164,6 +1164,240 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
11641164
tqs->stqs_taskq = NULL;
11651165
}
11661166

1167+
#ifdef _KERNEL
1168+
/*
1169+
* The READ and WRITE rows of zio_taskqs are configurable at module load time
1170+
* by setting zio_taskq_read or zio_taskq_write.
1171+
*
1172+
* Example (the defaults for READ and WRITE)
1173+
* zio_taskq_read='fixed,1,8 null scale null'
1174+
* zio_taskq_write='batch fixed,1,5 scale fixed,1,5'
1175+
*
1176+
* Each sets the entire row at a time.
1177+
*
1178+
* 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
1179+
* of threads per taskq.
1180+
*
1181+
* 'null' can only be set on the high-priority queues (queue selection for
1182+
* high-priority queues will fall back to the regular queue if the high-pri
1183+
* is NULL).
1184+
*/
1185+
static const char *const modes[ZTI_NMODES] = {
1186+
"fixed", "batch", "scale", "null"
1187+
};
1188+
1189+
static int
1190+
spa_taskq_param_set(zio_type_t t, const char *val, zfs_kernel_param_t *kp)
1191+
{
1192+
int err = 0;
1193+
1194+
zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {0};
1195+
1196+
char *cfg = kmem_strdup(val);
1197+
char *next = cfg, *tok, *c;
1198+
1199+
/*
1200+
* Parse out each element from the string and fill `row`. The entire
1201+
* row has to be set at once, so any errors are flagged by just
1202+
* breaking out of this loop early.
1203+
*/
1204+
uint_t q;
1205+
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1206+
/* `next` is the start of the config */
1207+
if (next == NULL)
1208+
break;
1209+
1210+
/* Eat up leading space */
1211+
while (isspace(*next))
1212+
next++;
1213+
if (*next == '\0')
1214+
break;
1215+
1216+
/* Mode ends at space or end of string */
1217+
tok = next;
1218+
next = strchr(tok, ' ');
1219+
if (next != NULL) *next++ = '\0';
1220+
1221+
/* Parameters start after a comma */
1222+
c = strchr(tok, ',');
1223+
if (c != NULL) *c++ = '\0';
1224+
1225+
/* Match mode string */
1226+
uint_t mode;
1227+
for (mode = 0; mode < ZTI_NMODES; mode++)
1228+
if (strcmp(tok, modes[mode]) == 0)
1229+
break;
1230+
if (mode == ZTI_NMODES)
1231+
break;
1232+
1233+
/* Invalid canary */
1234+
row[q].zti_mode = ZTI_NMODES;
1235+
1236+
/* Per-mode setup */
1237+
switch (mode) {
1238+
1239+
/*
1240+
* FIXED is parameterised: number of queues, and number of
1241+
* threads per queue.
1242+
*/
1243+
case ZTI_MODE_FIXED: {
1244+
/* No parameters? */
1245+
if (c == NULL || *c == '\0')
1246+
break;
1247+
1248+
/* Find next parameter */
1249+
tok = c;
1250+
c = strchr(tok, ',');
1251+
if (c == NULL)
1252+
break;
1253+
1254+
/* Take digits and convert */
1255+
unsigned long long nq;
1256+
if (!(isdigit(*tok)))
1257+
break;
1258+
err = ddi_strtoull(tok, &tok, 10, &nq);
1259+
/* Must succeed and also end at the next param sep */
1260+
if (err != 0 || tok != c)
1261+
break;
1262+
1263+
/* Move past the comma */
1264+
tok++;
1265+
/* Need another number */
1266+
if (!(isdigit(*tok)))
1267+
break;
1268+
/* Remember start to make sure we moved */
1269+
c = tok;
1270+
1271+
/* Take digits */
1272+
unsigned long long ntpq;
1273+
err = ddi_strtoull(tok, &tok, 10, &ntpq);
1274+
/* Must succeed, and moved forward */
1275+
if (err != 0 || tok == c || *tok != '\0')
1276+
break;
1277+
1278+
/*
1279+
* sanity; zero queues/threads make no sense, and
1280+
* 16K is almost certainly more than anyone will ever
1281+
* need and avoids silly numbers like UINT32_MAX
1282+
*/
1283+
if (nq == 0 || nq >= 16384 ||
1284+
ntpq == 0 || ntpq >= 16384)
1285+
break;
1286+
1287+
const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
1288+
row[q] = zti;
1289+
break;
1290+
}
1291+
1292+
case ZTI_MODE_BATCH: {
1293+
const zio_taskq_info_t zti = ZTI_BATCH;
1294+
row[q] = zti;
1295+
break;
1296+
}
1297+
1298+
case ZTI_MODE_SCALE: {
1299+
const zio_taskq_info_t zti = ZTI_SCALE;
1300+
row[q] = zti;
1301+
break;
1302+
}
1303+
1304+
case ZTI_MODE_NULL: {
1305+
/*
1306+
* Can only null the high-priority queues; the general-
1307+
* purpose ones have to exist.
1308+
*/
1309+
if (q != ZIO_TASKQ_ISSUE_HIGH &&
1310+
q != ZIO_TASKQ_INTERRUPT_HIGH)
1311+
break;
1312+
1313+
const zio_taskq_info_t zti = ZTI_NULL;
1314+
row[q] = zti;
1315+
break;
1316+
}
1317+
1318+
default:
1319+
break;
1320+
}
1321+
1322+
/* Ensure we set a mode */
1323+
if (row[q].zti_mode == ZTI_NMODES)
1324+
break;
1325+
}
1326+
1327+
/* Eat trailing space */
1328+
if (next != NULL)
1329+
while (isspace(*next))
1330+
next++;
1331+
1332+
/* If there's anything left over then fail */
1333+
if (next != NULL && *next != '\0') {
1334+
err = SET_ERROR(EINVAL);
1335+
goto out;
1336+
}
1337+
1338+
/* Didn't get a full row, fail */
1339+
if (q < ZIO_TASKQ_TYPES) {
1340+
err = SET_ERROR(EINVAL);
1341+
goto out;
1342+
}
1343+
1344+
/* Success! Copy it into the real config */
1345+
for (q = 0; q < ZIO_TASKQ_TYPES; q++)
1346+
zio_taskqs[t][q] = row[q];
1347+
1348+
out:
1349+
kmem_free(cfg, strlen(val) + 1);
1350+
return (err);
1351+
}
1352+
1353+
static int
1354+
spa_taskq_param_get(zio_type_t t, char *buf, zfs_kernel_param_t *kp)
1355+
{
1356+
int pos = 0;
1357+
1358+
/* Build paramater string from live config */
1359+
const char *sep = "";
1360+
for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
1361+
const zio_taskq_info_t *zti = &zio_taskqs[t][q];
1362+
if (zti->zti_mode == ZTI_MODE_FIXED)
1363+
pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
1364+
modes[zti->zti_mode], zti->zti_count,
1365+
zti->zti_value);
1366+
else
1367+
pos += sprintf(&buf[pos], "%s%s", sep,
1368+
modes[zti->zti_mode]);
1369+
sep = " ";
1370+
}
1371+
1372+
buf[pos++] = '\n';
1373+
buf[pos] = '\0';
1374+
1375+
return (pos);
1376+
}
1377+
1378+
static int
1379+
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
1380+
{
1381+
return (spa_taskq_param_set(ZIO_TYPE_READ, val, kp));
1382+
}
1383+
static int
1384+
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
1385+
{
1386+
return (spa_taskq_param_get(ZIO_TYPE_READ, buf, kp));
1387+
}
1388+
1389+
static int
1390+
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
1391+
{
1392+
return (spa_taskq_param_set(ZIO_TYPE_WRITE, val, kp));
1393+
}
1394+
static int
1395+
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
1396+
{
1397+
return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, kp));
1398+
}
1399+
#endif
1400+
11671401
/*
11681402
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
11691403
* Note that a type may have multiple discrete taskqs to avoid lock contention
@@ -10210,4 +10444,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
1021010444
ZMOD_RW,
1021110445
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
1021210446
"was being condensed");
10447+
10448+
#ifdef _KERNEL
10449+
/*
 * Register the zio_taskq_read and zio_taskq_write module parameters. They
 * have no backing variable; reads and writes go through the
 * spa_taskq_{read,write}_param_{get,set} handlers defined earlier in this
 * file.
 * NOTE(review): ZMOD_RD presumably makes them read-only once the module is
 * loaded, matching the "load time" wording used for these parameters;
 * confirm against the macro definition.
 */
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
10450+
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
10451+
"Configure IO queues for read IO");
10452+
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
10453+
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
10454+
"Configure IO queues for write IO");
10455+
#endif
1021310456
/* END CSTYLED */

0 commit comments

Comments
 (0)