@@ -85,10 +85,7 @@ typedef struct dbuf_stats {
85
85
/*
86
86
* Statistics for Direct I/O.
87
87
*/
88
- kstat_named_t direct_mixed_io_read_wait ;
89
- kstat_named_t direct_mixed_io_write_wait ;
90
88
kstat_named_t direct_sync_wait ;
91
- kstat_named_t direct_undirty ;
92
89
/*
93
90
* Statistics about the dbuf hash table.
94
91
*/
@@ -137,10 +134,7 @@ dbuf_stats_t dbuf_stats = {
137
134
{ "cache_total_evicts" , KSTAT_DATA_UINT64 },
138
135
{ { "cache_levels_N" , KSTAT_DATA_UINT64 } },
139
136
{ { "cache_levels_bytes_N" , KSTAT_DATA_UINT64 } },
140
- { "direct_mixed_io_read_wait" , KSTAT_DATA_UINT64 },
141
- { "direct_mixed_io_write_wait" , KSTAT_DATA_UINT64 },
142
137
{ "direct_sync_wait" , KSTAT_DATA_UINT64 },
143
- { "direct_undirty" , KSTAT_DATA_UINT64 },
144
138
{ "hash_hits" , KSTAT_DATA_UINT64 },
145
139
{ "hash_misses" , KSTAT_DATA_UINT64 },
146
140
{ "hash_collisions" , KSTAT_DATA_UINT64 },
@@ -162,10 +156,7 @@ struct {
162
156
wmsum_t cache_total_evicts ;
163
157
wmsum_t cache_levels [DN_MAX_LEVELS ];
164
158
wmsum_t cache_levels_bytes [DN_MAX_LEVELS ];
165
- wmsum_t direct_mixed_io_read_wait ;
166
- wmsum_t direct_mixed_io_write_wait ;
167
159
wmsum_t direct_sync_wait ;
168
- wmsum_t direct_undirty ;
169
160
wmsum_t hash_hits ;
170
161
wmsum_t hash_misses ;
171
162
wmsum_t hash_collisions ;
@@ -911,14 +902,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
911
902
ds -> cache_levels_bytes [i ].value .ui64 =
912
903
wmsum_value (& dbuf_sums .cache_levels_bytes [i ]);
913
904
}
914
- ds -> direct_mixed_io_read_wait .value .ui64 =
915
- wmsum_value (& dbuf_sums .direct_mixed_io_read_wait );
916
- ds -> direct_mixed_io_write_wait .value .ui64 =
917
- wmsum_value (& dbuf_sums .direct_mixed_io_write_wait );
918
905
ds -> direct_sync_wait .value .ui64 =
919
906
wmsum_value (& dbuf_sums .direct_sync_wait );
920
- ds -> direct_undirty .value .ui64 =
921
- wmsum_value (& dbuf_sums .direct_undirty );
922
907
ds -> hash_hits .value .ui64 =
923
908
wmsum_value (& dbuf_sums .hash_hits );
924
909
ds -> hash_misses .value .ui64 =
@@ -1021,10 +1006,7 @@ dbuf_init(void)
1021
1006
wmsum_init (& dbuf_sums .cache_levels [i ], 0 );
1022
1007
wmsum_init (& dbuf_sums .cache_levels_bytes [i ], 0 );
1023
1008
}
1024
- wmsum_init (& dbuf_sums .direct_mixed_io_read_wait , 0 );
1025
- wmsum_init (& dbuf_sums .direct_mixed_io_write_wait , 0 );
1026
1009
wmsum_init (& dbuf_sums .direct_sync_wait , 0 );
1027
- wmsum_init (& dbuf_sums .direct_undirty , 0 );
1028
1010
wmsum_init (& dbuf_sums .hash_hits , 0 );
1029
1011
wmsum_init (& dbuf_sums .hash_misses , 0 );
1030
1012
wmsum_init (& dbuf_sums .hash_collisions , 0 );
@@ -1097,10 +1079,7 @@ dbuf_fini(void)
1097
1079
wmsum_fini (& dbuf_sums .cache_levels [i ]);
1098
1080
wmsum_fini (& dbuf_sums .cache_levels_bytes [i ]);
1099
1081
}
1100
- wmsum_fini (& dbuf_sums .direct_mixed_io_read_wait );
1101
- wmsum_fini (& dbuf_sums .direct_mixed_io_write_wait );
1102
1082
wmsum_fini (& dbuf_sums .direct_sync_wait );
1103
- wmsum_fini (& dbuf_sums .direct_undirty );
1104
1083
wmsum_fini (& dbuf_sums .hash_hits );
1105
1084
wmsum_fini (& dbuf_sums .hash_misses );
1106
1085
wmsum_fini (& dbuf_sums .hash_collisions );
@@ -1271,9 +1250,8 @@ dbuf_clear_data(dmu_buf_impl_t *db)
1271
1250
{
1272
1251
ASSERT (MUTEX_HELD (& db -> db_mtx ));
1273
1252
dbuf_evict_user (db );
1274
- /* Direct I/O writes may have data */
1275
- if (db -> db_buf == NULL )
1276
- db -> db .db_data = NULL ;
1253
+ ASSERT3P (db -> db_buf , = = , NULL );
1254
+ db -> db .db_data = NULL ;
1277
1255
if (db -> db_state != DB_NOFILL ) {
1278
1256
db -> db_state = DB_UNCACHED ;
1279
1257
DTRACE_SET_STATE (db , "clear data" );
@@ -1733,10 +1711,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
1733
1711
* have been modified in a previous transaction group before we access them in
1734
1712
* the current active group.
1735
1713
*
1736
- * This function is used in three places: when we are dirtying a buffer for the
1714
+ * This function is used in four places: when we are dirtying a buffer for the
1737
1715
* first time in a txg, when we are freeing a range in a dnode that includes
1738
- * this buffer, and when we are accessing a buffer which was received compressed
1739
- * and later referenced in a WRITE_BYREF record.
1716
+ * this buffer, when doing block cloning or issuing a Direct I/O write with
1717
+ * a buffer, and when we are accessing a buffer which was received compressed and
1718
+ * later referenced in a WRITE_BYREF record.
1740
1719
*
1741
1720
* Note that when we are called from dbuf_free_range() we do not put a hold on
1742
1721
* the buffer, we just traverse the active dbuf list for the dnode.
@@ -2789,93 +2768,6 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2789
2768
return (dr != NULL );
2790
2769
}
2791
2770
2792
- void
2793
- dmu_buf_direct_mixed_io_wait (dmu_buf_impl_t * db , uint64_t txg , boolean_t read )
2794
- {
2795
- ASSERT (MUTEX_HELD (& db -> db_mtx ));
2796
-
2797
- if (read == B_TRUE ) {
2798
- /*
2799
- * If a buffered read is in process, a Direct I/O read will
2800
- * wait for the buffered I/O to complete.
2801
- */
2802
- ASSERT3U (txg , = = , 0 );
2803
- while (db -> db_state == DB_READ ) {
2804
- DBUF_STAT_BUMP (direct_mixed_io_read_wait );
2805
- cv_wait (& db -> db_changed , & db -> db_mtx );
2806
- }
2807
- } else {
2808
- /*
2809
- * There must be an ARC buf associated with this Direct I/O
2810
- * write otherwise there is no reason to wait for previous
2811
- * dirty records to sync out.
2812
- *
2813
- * The db_state will temporarily be set to DB_CACHED so that
2814
- * that any synchronous writes issued through the ZIL will
2815
- * still be handled properly. In particular, the call to
2816
- * dbuf_read() in dmu_sync_late_arrival() must account for the
2817
- * data still being in the ARC. After waiting here for previous
2818
- * TXGs to sync out, dmu_write_direct_done() will update the
2819
- * db_state.
2820
- */
2821
- ASSERT3P (db -> db_buf , != , NULL );
2822
- ASSERT3U (txg , > , 0 );
2823
- db -> db_mixed_io_dio_wait = TRUE;
2824
- db -> db_state = DB_CACHED ;
2825
- while (dbuf_find_dirty_lte (db , txg ) != NULL ) {
2826
- DBUF_STAT_BUMP (direct_mixed_io_write_wait );
2827
- cv_wait (& db -> db_changed , & db -> db_mtx );
2828
- }
2829
- db -> db_mixed_io_dio_wait = FALSE;
2830
- }
2831
- }
2832
-
2833
- /*
2834
- * Direct I/O writes may need to undirty the open-context dirty record
2835
- * associated with it in the event of an I/O error.
2836
- */
2837
- void
2838
- dmu_buf_undirty (dmu_buf_impl_t * db , dmu_tx_t * tx )
2839
- {
2840
- /*
2841
- * Direct I/O writes always happen in open-context.
2842
- */
2843
- ASSERT (!dmu_tx_is_syncing (tx ));
2844
- ASSERT (MUTEX_HELD (& db -> db_mtx ));
2845
- ASSERT (db -> db_state == DB_NOFILL || db -> db_state == DB_UNCACHED );
2846
-
2847
-
2848
- /*
2849
- * In the event of an I/O error we will handle the metaslab clean up in
2850
- * zio_done(). Also, the dirty record's dr_overridden_by BP is not
2851
- * currently set as that is done in dmu_sync_done(). Since the db_state
2852
- * is still set to DB_NOFILL, dbuf_unoverride() will not be called in
2853
- * dbuf_undirty() and the dirty record's BP will not be added the SPA's
2854
- * spa_free_bplist via zio_free().
2855
- *
2856
- * This function can also be called in the event that a Direct I/O
2857
- * write is overwriting a previous Direct I/O to the same block for
2858
- * this TXG. It is important to go ahead and free up the space
2859
- * accounting in this case through dbuf_undirty() -> dbuf_unoverride()
2860
- * -> zio_free(). This is necessary because the space accounting for
2861
- * determining if a write can occur in zfs_write() happens through
2862
- * dmu_tx_assign(). This can cause an issue with Direct I/O writes in
2863
- * the case of overwrites, because all DVA allocations are being done
2864
- * in open-context. Constanstly allowing Direct I/O overwrites to the
2865
- * same blocks can exhaust the pools available space leading to ENOSPC
2866
- * errors at the DVA allcoation part of the ZIO pipeline, which will
2867
- * eventually suspend the pool. By cleaning up space accounting now
2868
- * the ENOSPC pool suspend can be avoided.
2869
- *
2870
- * Since we are undirtying the record for the Direct I/O in
2871
- * open-context we must have a hold on the db, so it should never be
2872
- * evicted after calling dbuf_undirty().
2873
- */
2874
- VERIFY3B (dbuf_undirty (db , tx ), = = , B_FALSE );
2875
-
2876
- DBUF_STAT_BUMP (direct_undirty );
2877
- }
2878
-
2879
2771
/*
2880
2772
* Normally the db_blkptr points to the most recent on-disk content for the
2881
2773
* dbuf (and anything newer will be cached in the dbuf). However, a recent
@@ -2951,27 +2843,71 @@ dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
2951
2843
}
2952
2844
2953
2845
void
2954
- dmu_buf_will_clone (dmu_buf_t * db_fake , dmu_tx_t * tx )
2846
+ dmu_buf_will_clone_or_dio (dmu_buf_t * db_fake , dmu_tx_t * tx )
2955
2847
{
2956
2848
dmu_buf_impl_t * db = (dmu_buf_impl_t * )db_fake ;
2957
2849
2958
2850
/*
2959
- * Block cloning: We are going to clone into this block, so undirty
2960
- * modifications done to this block so far in this txg. This includes
2961
- * writes and clones into this block.
2851
+ * Block clones and Direct I/O writes always happen in open-context.
2962
2852
*/
2853
+ ASSERT (!dmu_tx_is_syncing (tx ));
2854
+ ASSERT (db -> db_level == 0 );
2855
+ ASSERT (db -> db_blkid != DMU_BONUS_BLKID );
2856
+ ASSERT (db -> db .db_object != DMU_META_DNODE_OBJECT );
2857
+
2963
2858
mutex_enter (& db -> db_mtx );
2964
2859
DBUF_VERIFY (db );
2965
- VERIFY (!dbuf_undirty (db , tx ));
2860
+
2861
+ /*
2862
+ * We are going to clone or issue a Direct I/O write on this block, so
2863
+ * undirty modifications done to this block so far in this txg. This
2864
+ * includes writes and clones into this block.
2865
+ *
2866
+ * If there is a dirty record associated with this txg from a previous Direct
2867
+ * I/O write then space accounting cleanup takes place. It is important
2868
+ * to go ahead and free up the space accounting through dbuf_undirty() ->
2869
+ * dbuf_unoverride() -> zio_free(). Space accounting for determining
2870
+ * if a write can occur in zfs_write() happens through dmu_tx_assign().
2871
+ * This can cause an issue with Direct I/O writes in the case of
2872
+ * overwriting the same block, because all DVA allocations are being
2873
+ * done in open-context. Constantly allowing Direct I/O overwrites to
2874
+ * the same block can exhaust the pools available space leading to
2875
+ * ENOSPC errors at the DVA allocation part of the ZIO pipeline, which
2876
+ * will eventually suspend the pool. By cleaning up space accounting
2877
+ * now, the ENOSPC error can be avoided.
2878
+ *
2879
+ * Since we are undirtying the record in open-context, we must have a
2880
+ * hold on the db, so it should never be evicted after calling
2881
+ * dbuf_undirty().
2882
+ */
2883
+ VERIFY3B (dbuf_undirty (db , tx ), = = , B_FALSE );
2966
2884
ASSERT0P (dbuf_find_dirty_eq (db , tx -> tx_txg ));
2885
+
2967
2886
if (db -> db_buf != NULL ) {
2968
- arc_buf_destroy (db -> db_buf , db );
2887
+ /*
2888
+ * If there is an associated ARC buffer with this dbuf we can
2889
+ * only destroy it if the previous dirty record does not
2890
+ * reference it.
2891
+ */
2892
+ dbuf_dirty_record_t * dr = list_head (& db -> db_dirty_records );
2893
+ if (dr == NULL || dr -> dt .dl .dr_data != db -> db_buf )
2894
+ arc_buf_destroy (db -> db_buf , db );
2895
+
2896
+ /*
2897
+ * Setting the dbuf's data pointers to NULL will force all
2898
+ * future reads down to the devices to get the most up to date
2899
+ * version of the data after a Direct I/O write has completed.
2900
+ */
2969
2901
db -> db_buf = NULL ;
2970
2902
dbuf_clear_data (db );
2971
2903
}
2972
2904
2905
+ ASSERT3P (db -> db_buf , = = , NULL );
2906
+ ASSERT3P (db -> db .db_data , = = , NULL );
2907
+
2973
2908
db -> db_state = DB_NOFILL ;
2974
- DTRACE_SET_STATE (db , "allocating NOFILL buffer for clone" );
2909
+ DTRACE_SET_STATE (db ,
2910
+ "allocating NOFILL buffer for clone or direct I/O write" );
2975
2911
2976
2912
DBUF_VERIFY (db );
2977
2913
mutex_exit (& db -> db_mtx );
@@ -3510,7 +3446,6 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
3510
3446
db -> db_user_immediate_evict = FALSE;
3511
3447
db -> db_freed_in_flight = FALSE;
3512
3448
db -> db_pending_evict = FALSE;
3513
- db -> db_mixed_io_dio_wait = FALSE;
3514
3449
3515
3450
if (blkid == DMU_BONUS_BLKID ) {
3516
3451
ASSERT3P (parent , = = , dn -> dn_dbuf );
@@ -4766,25 +4701,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4766
4701
dprintf_dbuf_bp (db , db -> db_blkptr , "blkptr=%p" , db -> db_blkptr );
4767
4702
4768
4703
mutex_enter (& db -> db_mtx );
4769
-
4770
- /*
4771
- * It is possible a buffered read has come in after a Direct I/O
4772
- * write and is currently transistioning the db_state from DB_READ
4773
- * in dbuf_read_impl() to another state in dbuf_read_done(). We
4774
- * have to wait in order for the dbuf state to change from DB_READ
4775
- * before syncing the dirty record of the Direct I/O write.
4776
- */
4777
- if (db -> db_state == DB_READ && !dr -> dt .dl .dr_brtwrite ) {
4778
- ASSERT3P (* datap , = = , NULL );
4779
- ASSERT3P (db -> db_buf , = = , NULL );
4780
- ASSERT3P (db -> db .db_data , = = , NULL );
4781
- ASSERT3U (dr -> dt .dl .dr_override_state , = = , DR_OVERRIDDEN );
4782
- while (db -> db_state == DB_READ ) {
4783
- DBUF_STAT_BUMP (direct_sync_wait );
4784
- cv_wait (& db -> db_changed , & db -> db_mtx );
4785
- }
4786
- }
4787
-
4788
4704
/*
4789
4705
* To be synced, we must be dirtied. But we might have been freed
4790
4706
* after the dirty.
@@ -4797,13 +4713,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4797
4713
ASSERT (db -> db .db_data != dr -> dt .dl .dr_data );
4798
4714
} else if (db -> db_state == DB_READ ) {
4799
4715
/*
4800
- * This buffer has a clone we need to write, and an in-flight
4801
- * read on the BP we're about to clone. Its safe to issue the
4802
- * write here because the read has already been issued and the
4803
- * contents won't change.
4716
+ * This buffer was either cloned or had a Direct I/O write
4717
+ * occur and has an in-flight read on the BP. It is safe to
4718
+ * issue the write here, because the read has already been
4719
+ * issued and the contents won't change.
4720
+ *
4721
+ * We can verify the case of both the clone and Direct I/O
4722
+ * write by making sure the first dirty record for the dbuf
4723
+ * has no ARC buffer associated with it.
4804
4724
*/
4805
- ASSERT (dr -> dt .dl .dr_brtwrite &&
4806
- dr -> dt .dl .dr_override_state == DR_OVERRIDDEN );
4725
+ dbuf_dirty_record_t * dr_head =
4726
+ list_head (& db -> db_dirty_records );
4727
+ ASSERT3P (db -> db_buf , = = , NULL );
4728
+ ASSERT3P (db -> db .db_data , = = , NULL );
4729
+ ASSERT3P (dr_head -> dt .dl .dr_data , = = , NULL );
4730
+ ASSERT3U (dr_head -> dt .dl .dr_override_state , = = , DR_OVERRIDDEN );
4807
4731
} else {
4808
4732
ASSERT (db -> db_state == DB_CACHED || db -> db_state == DB_NOFILL );
4809
4733
}
@@ -5503,7 +5427,7 @@ EXPORT_SYMBOL(dbuf_dirty);
5503
5427
EXPORT_SYMBOL (dmu_buf_set_crypt_params );
5504
5428
EXPORT_SYMBOL (dmu_buf_will_dirty );
5505
5429
EXPORT_SYMBOL (dmu_buf_is_dirty );
5506
- EXPORT_SYMBOL (dmu_buf_will_clone );
5430
+ EXPORT_SYMBOL (dmu_buf_will_clone_or_dio );
5507
5431
EXPORT_SYMBOL (dmu_buf_will_not_fill );
5508
5432
EXPORT_SYMBOL (dmu_buf_will_fill );
5509
5433
EXPORT_SYMBOL (dmu_buf_fill_done );
0 commit comments