Skip to content

Commit 1eacf2b

Browse files
gllghr authored and behlendorf committed
Improve rate at which new zvols are processed
The kernel function which adds new zvols as disks to the system, add_disk(), briefly opens and closes the zvol as part of its work. Closing a zvol involves waiting for two txgs to sync. This, combined with the fact that the taskq processing new zvols is single threaded, makes the processing of new zvols slow. Waiting for these txgs to sync is only necessary if the zvol has been written to, which is not the case during add_disk(). This change adds tracking of whether a zvol has been written to so that we can skip the txg_wait_synced() calls when they are unnecessary. This change also fixes the flags passed to blkdev_get_by_path() by vdev_disk_open() to be FMODE_READ | FMODE_WRITE | FMODE_EXCL instead of just FMODE_EXCL. The flags were being incorrectly calculated because we were using the wrong version of vdev_bdev_mode(). Reviewed-by: George Wilson <[email protected]> Reviewed-by: Serapheim Dimitropoulos <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: John Gallagher <[email protected]> Closes #8526 Closes #8615
1 parent b3b6098 commit 1eacf2b

File tree

2 files changed

+116
-86
lines changed

2 files changed

+116
-86
lines changed

module/zfs/vdev_disk.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
2424
* Rewritten for Linux by Brian Behlendorf <[email protected]>.
2525
* LLNL-CODE-403049.
26-
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
26+
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
2727
*/
2828

2929
#include <sys/zfs_context.h>
@@ -56,7 +56,7 @@ typedef struct dio_request {
5656
} dio_request_t;
5757

5858

59-
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
59+
#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH)
6060
static fmode_t
6161
vdev_bdev_mode(int smode)
6262
{

module/zfs/zvol.c

Lines changed: 114 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
*
3737
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
3838
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
39-
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
39+
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
4040
*/
4141

4242
/*
@@ -155,6 +155,11 @@ typedef struct {
155155
} zvol_task_t;
156156

157157
#define ZVOL_RDONLY 0x1
158+
/*
159+
* Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which
160+
* specifies whether or not the zvol _can_ be written to)
161+
*/
162+
#define ZVOL_WRITTEN_TO 0x2
158163

159164
static uint64_t
160165
zvol_name_hash(const char *name)
@@ -742,6 +747,7 @@ zvol_write(void *arg)
742747

743748
zvol_state_t *zv = zvr->zv;
744749
ASSERT(zv && zv->zv_open_count > 0);
750+
ASSERT(zv->zv_zilog != NULL);
745751

746752
ssize_t start_resid = uio.uio_resid;
747753
unsigned long start_jif = jiffies;
@@ -832,6 +838,7 @@ zvol_discard(void *arg)
832838
unsigned long start_jif;
833839

834840
ASSERT(zv && zv->zv_open_count > 0);
841+
ASSERT(zv->zv_zilog != NULL);
835842

836843
start_jif = jiffies;
837844
blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
@@ -930,6 +937,86 @@ zvol_read(void *arg)
930937
kmem_free(zvr, sizeof (zv_request_t));
931938
}
932939

940+
/* ARGSUSED */
941+
static void
942+
zvol_get_done(zgd_t *zgd, int error)
943+
{
944+
if (zgd->zgd_db)
945+
dmu_buf_rele(zgd->zgd_db, zgd);
946+
947+
rangelock_exit(zgd->zgd_lr);
948+
949+
kmem_free(zgd, sizeof (zgd_t));
950+
}
951+
952+
/*
953+
* Get data to generate a TX_WRITE intent log record.
954+
*/
955+
static int
956+
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
957+
{
958+
zvol_state_t *zv = arg;
959+
uint64_t offset = lr->lr_offset;
960+
uint64_t size = lr->lr_length;
961+
dmu_buf_t *db;
962+
zgd_t *zgd;
963+
int error;
964+
965+
ASSERT3P(lwb, !=, NULL);
966+
ASSERT3P(zio, !=, NULL);
967+
ASSERT3U(size, !=, 0);
968+
969+
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
970+
zgd->zgd_lwb = lwb;
971+
972+
/*
973+
* Write records come in two flavors: immediate and indirect.
974+
* For small writes it's cheaper to store the data with the
975+
* log record (immediate); for large writes it's cheaper to
976+
* sync the data and get a pointer to it (indirect) so that
977+
* we don't have to write the data twice.
978+
*/
979+
if (buf != NULL) { /* immediate write */
980+
zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
981+
RL_READER);
982+
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
983+
DMU_READ_NO_PREFETCH);
984+
} else { /* indirect write */
985+
/*
986+
* Have to lock the whole block to ensure when it's written out
987+
* and its checksum is being calculated that no one can change
988+
* the data. Contrarily to zfs_get_data we need not re-check
989+
* blocksize after we get the lock because it cannot be changed.
990+
*/
991+
size = zv->zv_volblocksize;
992+
offset = P2ALIGN_TYPED(offset, size, uint64_t);
993+
zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
994+
RL_READER);
995+
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
996+
DMU_READ_NO_PREFETCH);
997+
if (error == 0) {
998+
blkptr_t *bp = &lr->lr_blkptr;
999+
1000+
zgd->zgd_db = db;
1001+
zgd->zgd_bp = bp;
1002+
1003+
ASSERT(db != NULL);
1004+
ASSERT(db->db_offset == offset);
1005+
ASSERT(db->db_size == size);
1006+
1007+
error = dmu_sync(zio, lr->lr_common.lrc_txg,
1008+
zvol_get_done, zgd);
1009+
1010+
if (error == 0)
1011+
return (0);
1012+
}
1013+
}
1014+
1015+
zvol_get_done(zgd, error);
1016+
1017+
return (SET_ERROR(error));
1018+
}
1019+
9331020
static MAKE_REQUEST_FN_RET
9341021
zvol_request(struct request_queue *q, struct bio *bio)
9351022
{
@@ -965,6 +1052,23 @@ zvol_request(struct request_queue *q, struct bio *bio)
9651052
*/
9661053
rw_enter(&zv->zv_suspend_lock, RW_READER);
9671054

1055+
/*
1056+
* Open a ZIL if this is the first time we have written to this
1057+
* zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1058+
* than zv_state_lock so that we don't need to acquire an
1059+
* additional lock in this path.
1060+
*/
1061+
if (zv->zv_zilog == NULL) {
1062+
rw_exit(&zv->zv_suspend_lock);
1063+
rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1064+
if (zv->zv_zilog == NULL) {
1065+
zv->zv_zilog = zil_open(zv->zv_objset,
1066+
zvol_get_data);
1067+
zv->zv_flags |= ZVOL_WRITTEN_TO;
1068+
}
1069+
rw_downgrade(&zv->zv_suspend_lock);
1070+
}
1071+
9681072
/* bio marked as FLUSH need to flush before write */
9691073
if (bio_is_flush(bio))
9701074
zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -1040,86 +1144,6 @@ zvol_request(struct request_queue *q, struct bio *bio)
10401144
#endif
10411145
}
10421146

1043-
/* ARGSUSED */
1044-
static void
1045-
zvol_get_done(zgd_t *zgd, int error)
1046-
{
1047-
if (zgd->zgd_db)
1048-
dmu_buf_rele(zgd->zgd_db, zgd);
1049-
1050-
rangelock_exit(zgd->zgd_lr);
1051-
1052-
kmem_free(zgd, sizeof (zgd_t));
1053-
}
1054-
1055-
/*
1056-
* Get data to generate a TX_WRITE intent log record.
1057-
*/
1058-
static int
1059-
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1060-
{
1061-
zvol_state_t *zv = arg;
1062-
uint64_t offset = lr->lr_offset;
1063-
uint64_t size = lr->lr_length;
1064-
dmu_buf_t *db;
1065-
zgd_t *zgd;
1066-
int error;
1067-
1068-
ASSERT3P(lwb, !=, NULL);
1069-
ASSERT3P(zio, !=, NULL);
1070-
ASSERT3U(size, !=, 0);
1071-
1072-
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1073-
zgd->zgd_lwb = lwb;
1074-
1075-
/*
1076-
* Write records come in two flavors: immediate and indirect.
1077-
* For small writes it's cheaper to store the data with the
1078-
* log record (immediate); for large writes it's cheaper to
1079-
* sync the data and get a pointer to it (indirect) so that
1080-
* we don't have to write the data twice.
1081-
*/
1082-
if (buf != NULL) { /* immediate write */
1083-
zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
1084-
RL_READER);
1085-
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
1086-
DMU_READ_NO_PREFETCH);
1087-
} else { /* indirect write */
1088-
/*
1089-
* Have to lock the whole block to ensure when it's written out
1090-
* and its checksum is being calculated that no one can change
1091-
* the data. Contrarily to zfs_get_data we need not re-check
1092-
* blocksize after we get the lock because it cannot be changed.
1093-
*/
1094-
size = zv->zv_volblocksize;
1095-
offset = P2ALIGN_TYPED(offset, size, uint64_t);
1096-
zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
1097-
RL_READER);
1098-
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
1099-
DMU_READ_NO_PREFETCH);
1100-
if (error == 0) {
1101-
blkptr_t *bp = &lr->lr_blkptr;
1102-
1103-
zgd->zgd_db = db;
1104-
zgd->zgd_bp = bp;
1105-
1106-
ASSERT(db != NULL);
1107-
ASSERT(db->db_offset == offset);
1108-
ASSERT(db->db_size == size);
1109-
1110-
error = dmu_sync(zio, lr->lr_common.lrc_txg,
1111-
zvol_get_done, zgd);
1112-
1113-
if (error == 0)
1114-
return (0);
1115-
}
1116-
}
1117-
1118-
zvol_get_done(zgd, error);
1119-
1120-
return (SET_ERROR(error));
1121-
}
1122-
11231147
/*
11241148
* The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
11251149
*/
@@ -1157,6 +1181,9 @@ zvol_setup_zv(zvol_state_t *zv)
11571181
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
11581182
ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
11591183

1184+
zv->zv_zilog = NULL;
1185+
zv->zv_flags &= ~ZVOL_WRITTEN_TO;
1186+
11601187
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
11611188
if (error)
11621189
return (SET_ERROR(error));
@@ -1171,7 +1198,6 @@ zvol_setup_zv(zvol_state_t *zv)
11711198

11721199
set_capacity(zv->zv_disk, volsize >> 9);
11731200
zv->zv_volsize = volsize;
1174-
zv->zv_zilog = zil_open(os, zvol_get_data);
11751201

11761202
if (ro || dmu_objset_is_snapshot(os) ||
11771203
!spa_writeable(dmu_objset_spa(os))) {
@@ -1194,7 +1220,11 @@ zvol_shutdown_zv(zvol_state_t *zv)
11941220
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
11951221
RW_LOCK_HELD(&zv->zv_suspend_lock));
11961222

1197-
zil_close(zv->zv_zilog);
1223+
if (zv->zv_flags & ZVOL_WRITTEN_TO) {
1224+
ASSERT(zv->zv_zilog != NULL);
1225+
zil_close(zv->zv_zilog);
1226+
}
1227+
11981228
zv->zv_zilog = NULL;
11991229

12001230
dnode_rele(zv->zv_dn, FTAG);
@@ -1204,7 +1234,7 @@ zvol_shutdown_zv(zvol_state_t *zv)
12041234
* Evict cached data. We must write out any dirty data before
12051235
* disowning the dataset.
12061236
*/
1207-
if (!(zv->zv_flags & ZVOL_RDONLY))
1237+
if (zv->zv_flags & ZVOL_WRITTEN_TO)
12081238
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
12091239
(void) dmu_objset_evict_dbufs(zv->zv_objset);
12101240
}

0 commit comments

Comments (0)