  *
  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  */

 /*
@@ -155,6 +155,11 @@ typedef struct {
 } zvol_task_t;

 #define	ZVOL_RDONLY	0x1
+/*
+ * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which
+ * specifies whether or not the zvol _can_ be written to)
+ */
+#define	ZVOL_WRITTEN_TO	0x2

 static uint64_t
 zvol_name_hash(const char *name)
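
ZVOL_WRITTEN_TO shares the zv_flags bitmask with ZVOL_RDONLY, so the rest of
the patch manipulates it with the usual set/clear/test idioms. A minimal
standalone sketch of those idioms (userspace only; zv_flags is reduced to a
local variable here, and the printf stands in for the real teardown work):

#include <stdint.h>
#include <stdio.h>

#define	ZVOL_RDONLY	0x1
#define	ZVOL_WRITTEN_TO	0x2

int
main(void)
{
        uint32_t zv_flags = 0;

        zv_flags &= ~ZVOL_WRITTEN_TO;       /* setup path: start clean */
        zv_flags |= ZVOL_WRITTEN_TO;        /* write path: first write seen */
        if (zv_flags & ZVOL_WRITTEN_TO)     /* shutdown path: ZIL was opened */
                printf("close the ZIL and wait for dirty data\n");
        return (0);
}
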
@@ -742,6 +747,7 @@ zvol_write(void *arg)

 	zvol_state_t *zv = zvr->zv;
 	ASSERT(zv && zv->zv_open_count > 0);
+	ASSERT(zv->zv_zilog != NULL);

 	ssize_t start_resid = uio.uio_resid;
 	unsigned long start_jif = jiffies;
@@ -832,6 +838,7 @@ zvol_discard(void *arg)
 	unsigned long start_jif;

 	ASSERT(zv && zv->zv_open_count > 0);
+	ASSERT(zv->zv_zilog != NULL);

 	start_jif = jiffies;
 	blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
@@ -930,6 +937,86 @@ zvol_read(void *arg)
 	kmem_free(zvr, sizeof (zv_request_t));
 }

+/* ARGSUSED */
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	rangelock_exit(zgd->zgd_lr);
+
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+	zvol_state_t *zv = arg;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error;
+
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_lwb = lwb;
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+		    RL_READER);
+		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's written out
+		 * and its checksum is being calculated that no one can change
+		 * the data. Contrarily to zfs_get_data we need not re-check
+		 * blocksize after we get the lock because it cannot be changed.
+		 */
+		size = zv->zv_volblocksize;
+		offset = P2ALIGN_TYPED(offset, size, uint64_t);
+		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
+		    RL_READER);
+		error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db != NULL);
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zvol_get_done, zgd);
+
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	zvol_get_done(zgd, error);
+
+	return (SET_ERROR(error));
+}
+
 static MAKE_REQUEST_FN_RET
 zvol_request(struct request_queue *q, struct bio *bio)
 {
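
The immediate/indirect comment above captures the central tradeoff in
zvol_get_data(): small payloads are copied into the log record itself, large
ones are synced in place and referenced by block pointer so the data is not
written twice. A minimal sketch of that size-based decision (the threshold,
enum, and helper names are hypothetical; the real ZIL policy also depends on
pool and dataset state):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical cutoff; illustrative only. */
#define	IMMEDIATE_WRITE_SZ	32768

typedef enum {
        WRITE_IMMEDIATE,        /* copy payload into the log record */
        WRITE_INDIRECT          /* sync payload, log a block pointer */
} write_style_t;

static write_style_t
choose_write_style(uint64_t size)
{
        /* Small writes: copying into the log record is the cheaper path. */
        if (size <= IMMEDIATE_WRITE_SZ)
                return (WRITE_IMMEDIATE);
        /* Large writes: sync the block once and reference it by pointer. */
        return (WRITE_INDIRECT);
}

int
main(void)
{
        printf("4K write: %s\n", choose_write_style(4096) ==
            WRITE_IMMEDIATE ? "immediate" : "indirect");
        printf("128K write: %s\n", choose_write_style(131072) ==
            WRITE_IMMEDIATE ? "immediate" : "indirect");
        return (0);
}
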
@@ -965,6 +1052,23 @@ zvol_request(struct request_queue *q, struct bio *bio)
 	 */
 	rw_enter(&zv->zv_suspend_lock, RW_READER);

+	/*
+	 * Open a ZIL if this is the first time we have written to this
+	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+	 * than zv_state_lock so that we don't need to acquire an
+	 * additional lock in this path.
+	 */
+	if (zv->zv_zilog == NULL) {
+		rw_exit(&zv->zv_suspend_lock);
+		rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+		if (zv->zv_zilog == NULL) {
+			zv->zv_zilog = zil_open(zv->zv_objset,
+			    zvol_get_data);
+			zv->zv_flags |= ZVOL_WRITTEN_TO;
+		}
+		rw_downgrade(&zv->zv_suspend_lock);
+	}
+
 	/* bio marked as FLUSH need to flush before write */
 	if (bio_is_flush(bio))
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
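
The block added above is classic double-checked initialization under a
reader/writer lock: drop the read lock, take the write lock, re-check
(another writer may have raced in while the lock was dropped), initialize
once, then downgrade back to the read side. A minimal userspace analogue
using POSIX rwlocks, with all names illustrative (pthread_rwlock_t has no
downgrade primitive, so this sketch re-acquires the read lock instead):

#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t suspend_lock = PTHREAD_RWLOCK_INITIALIZER;
static void *zilog;             /* stands in for zv->zv_zilog */

static void *
open_zil(void)                  /* stands in for zil_open() */
{
        static int token;
        return (&token);
}

static void
do_write_io(void)
{
        pthread_rwlock_rdlock(&suspend_lock);
        if (zilog == NULL) {
                /* Upgrade: release the read lock, take the write lock. */
                pthread_rwlock_unlock(&suspend_lock);
                pthread_rwlock_wrlock(&suspend_lock);
                /* Re-check: another thread may have initialized it. */
                if (zilog == NULL)
                        zilog = open_zil();
                /* No downgrade in pthreads; fall back to the read lock. */
                pthread_rwlock_unlock(&suspend_lock);
                pthread_rwlock_rdlock(&suspend_lock);
        }
        /* ... issue the write while holding the read lock ... */
        pthread_rwlock_unlock(&suspend_lock);
}

int
main(void)
{
        do_write_io();          /* first call opens the ZIL stand-in */
        do_write_io();          /* later calls take the read-lock fast path */
        return (0);
}

The kernel's rw_downgrade() keeps zv_suspend_lock held continuously from
initialization through the I/O; POSIX offers no equivalent, hence the
unlock/relock gap in the sketch.
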
@@ -1040,86 +1144,6 @@ zvol_request(struct request_queue *q, struct bio *bio)
 #endif
 }

-/* ARGSUSED */
-static void
-zvol_get_done(zgd_t *zgd, int error)
-{
-	if (zgd->zgd_db)
-		dmu_buf_rele(zgd->zgd_db, zgd);
-
-	rangelock_exit(zgd->zgd_lr);
-
-	kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
-	zvol_state_t *zv = arg;
-	uint64_t offset = lr->lr_offset;
-	uint64_t size = lr->lr_length;
-	dmu_buf_t *db;
-	zgd_t *zgd;
-	int error;
-
-	ASSERT3P(lwb, !=, NULL);
-	ASSERT3P(zio, !=, NULL);
-	ASSERT3U(size, !=, 0);
-
-	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_lwb = lwb;
-
-	/*
-	 * Write records come in two flavors: immediate and indirect.
-	 * For small writes it's cheaper to store the data with the
-	 * log record (immediate); for large writes it's cheaper to
-	 * sync the data and get a pointer to it (indirect) so that
-	 * we don't have to write the data twice.
-	 */
-	if (buf != NULL) { /* immediate write */
-		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
-		    RL_READER);
-		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
-		    DMU_READ_NO_PREFETCH);
-	} else { /* indirect write */
-		/*
-		 * Have to lock the whole block to ensure when it's written out
-		 * and its checksum is being calculated that no one can change
-		 * the data. Contrarily to zfs_get_data we need not re-check
-		 * blocksize after we get the lock because it cannot be changed.
-		 */
-		size = zv->zv_volblocksize;
-		offset = P2ALIGN_TYPED(offset, size, uint64_t);
-		zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
-		    RL_READER);
-		error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
-		    DMU_READ_NO_PREFETCH);
-		if (error == 0) {
-			blkptr_t *bp = &lr->lr_blkptr;
-
-			zgd->zgd_db = db;
-			zgd->zgd_bp = bp;
-
-			ASSERT(db != NULL);
-			ASSERT(db->db_offset == offset);
-			ASSERT(db->db_size == size);
-
-			error = dmu_sync(zio, lr->lr_common.lrc_txg,
-			    zvol_get_done, zgd);
-
-			if (error == 0)
-				return (0);
-		}
-	}
-
-	zvol_get_done(zgd, error);
-
-	return (SET_ERROR(error));
-}
-
 /*
  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
  */
@@ -1157,6 +1181,9 @@ zvol_setup_zv(zvol_state_t *zv)
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));

+	zv->zv_zilog = NULL;
+	zv->zv_flags &= ~ZVOL_WRITTEN_TO;
+
 	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
 	if (error)
 		return (SET_ERROR(error));
@@ -1171,7 +1198,6 @@ zvol_setup_zv(zvol_state_t *zv)

 	set_capacity(zv->zv_disk, volsize >> 9);
 	zv->zv_volsize = volsize;
-	zv->zv_zilog = zil_open(os, zvol_get_data);

 	if (ro || dmu_objset_is_snapshot(os) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
@@ -1194,7 +1220,11 @@ zvol_shutdown_zv(zvol_state_t *zv)
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 	    RW_LOCK_HELD(&zv->zv_suspend_lock));

-	zil_close(zv->zv_zilog);
+	if (zv->zv_flags & ZVOL_WRITTEN_TO) {
+		ASSERT(zv->zv_zilog != NULL);
+		zil_close(zv->zv_zilog);
+	}
+
 	zv->zv_zilog = NULL;

 	dnode_rele(zv->zv_dn, FTAG);
@@ -1204,7 +1234,7 @@ zvol_shutdown_zv(zvol_state_t *zv)
 	 * Evict cached data. We must write out any dirty data before
 	 * disowning the dataset.
 	 */
-	if (!(zv->zv_flags & ZVOL_RDONLY))
+	if (zv->zv_flags & ZVOL_WRITTEN_TO)
 		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 	(void) dmu_objset_evict_dbufs(zv->zv_objset);
 }