Skip to content

Commit 6aba5bc

Browse files
tonyhutter authored and calccrypto committed
Linux 6.9: Call add_disk() from workqueue to fix zfs_allow_010_pos (openzfs#16282)
The 6.9 kernel behaves differently in how it releases block devices. In the common case it will async release the device only after the return to userspace. This is different from the 6.8 and older kernels which release the block devices synchronously. To get around this, call add_disk() from a workqueue so that the kernel uses a different codepath to release our zvols in the way we expect. This stops zfs_allow_010_pos from hanging. Fixes: openzfs#16089 Signed-off-by: Tony Hutter <[email protected]> Reviewed-by: Tino Reichardt <[email protected]> Reviewed-by: Rob Norris <[email protected]>
1 parent cc2a897 commit 6aba5bc

File tree

1 file changed

+97
-5
lines changed

1 file changed

+97
-5
lines changed

module/os/linux/zfs/zvol_os.c

Lines changed: 97 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141

4242
#include <linux/blkdev_compat.h>
4343
#include <linux/task_io_accounting_ops.h>
44+
#include <linux/workqueue.h>
4445

4546
#ifdef HAVE_BLK_MQ
4647
#include <linux/blk-mq.h>
@@ -1338,6 +1339,101 @@ zvol_wait_close(zvol_state_t *zv)
13381339
{
13391340
}
13401341

1342+
struct add_disk_work {
1343+
struct delayed_work work;
1344+
struct gendisk *disk;
1345+
int error;
1346+
};
1347+
1348+
static int
1349+
__zvol_os_add_disk(struct gendisk *disk)
1350+
{
1351+
int error = 0;
1352+
#ifdef HAVE_ADD_DISK_RET
1353+
error = add_disk(disk);
1354+
#else
1355+
add_disk(disk);
1356+
#endif
1357+
return (error);
1358+
}
1359+
1360+
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
1361+
static void
1362+
zvol_os_add_disk_work(struct work_struct *work)
1363+
{
1364+
struct add_disk_work *add_disk_work;
1365+
add_disk_work = container_of(work, struct add_disk_work, work.work);
1366+
add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
1367+
}
1368+
#endif
1369+
1370+
/*
1371+
* SPECIAL CASE:
1372+
*
1373+
* This function basically calls add_disk() from a workqueue. You may be
1374+
* thinking: why not just call add_disk() directly?
1375+
*
1376+
* When you call add_disk(), the zvol appears to the world. When this happens,
1377+
* the kernel calls disk_scan_partitions() on the zvol, which behaves
1378+
* differently on the 6.9+ kernels:
1379+
*
1380+
* - 6.8 and older kernels -
1381+
* disk_scan_partitions()
1382+
* handle = bdev_open_by_dev(
1383+
* zvol_open()
1384+
* bdev_release(handle);
1385+
* zvol_release()
1386+
*
1387+
*
1388+
* - 6.9+ kernels -
1389+
* disk_scan_partitions()
1390+
* file = bdev_file_open_by_dev()
1391+
* zvol_open()
1392+
* fput(file)
1393+
* < wait for return to userspace >
1394+
* zvol_release()
1395+
*
1396+
* The difference is that the bdev_release() from the 6.8 kernel is synchronous
1397+
* while the fput() from the 6.9 kernel is async. Or more specifically it's
1398+
* async that has to wait until we return to userspace (since it adds the fput
1399+
* into the caller's work queue with the TWA_RESUME flag set). This is not the
1400+
* behavior we want, since we want do things like create+destroy a zvol within
1401+
* a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the
1402+
* reference to the zvol while we're in the IOCTL, which can't wait until we
1403+
* return to userspace.
1404+
*
1405+
* We can get around this since fput() has a special codepath for when it's
1406+
* running in a kernel thread or interrupt. In those cases, it just puts the
1407+
* fput into the system workqueue, which we can force to run with
1408+
* __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
1409+
* run from a kernel thread and "tricks" the fput() codepaths.
1410+
*
1411+
* Note that __flush_workqueue() is slowly getting deprecated. This may be ok
1412+
* though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
1413+
* fput) to happen, which it eventually, naturally, will from the system_wq
1414+
* without us explicitly calling __flush_workqueue().
1415+
*/
1416+
static int
1417+
zvol_os_add_disk(struct gendisk *disk)
1418+
{
1419+
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */
1420+
struct add_disk_work add_disk_work;
1421+
1422+
INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
1423+
add_disk_work.disk = disk;
1424+
add_disk_work.error = 0;
1425+
1426+
/* Use *_delayed_work functions since they're not GPL'd */
1427+
schedule_delayed_work(&add_disk_work.work, 0);
1428+
flush_delayed_work(&add_disk_work.work);
1429+
1430+
__flush_workqueue(system_wq);
1431+
return (add_disk_work.error);
1432+
#else /* <= 6.8 kernel */
1433+
return (__zvol_os_add_disk(disk));
1434+
#endif
1435+
}
1436+
13411437
/*
13421438
* Create a block device minor node and setup the linkage between it
13431439
* and the specified volume. Once this function returns the block
@@ -1549,11 +1645,7 @@ zvol_os_create_minor(const char *name)
15491645
rw_enter(&zvol_state_lock, RW_WRITER);
15501646
zvol_insert(zv);
15511647
rw_exit(&zvol_state_lock);
1552-
#ifdef HAVE_ADD_DISK_RET
1553-
error = add_disk(zv->zv_zso->zvo_disk);
1554-
#else
1555-
add_disk(zv->zv_zso->zvo_disk);
1556-
#endif
1648+
error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
15571649
} else {
15581650
ida_simple_remove(&zvol_ida, idx);
15591651
}

0 commit comments

Comments
 (0)