Skip to content

Commit d4d7945

Browse files
don-brady and behlendorf
authored and committed
Add DDT prune command
Requires the new 'flat' physical data, which has the start time for a class entry.

The amount to prune can be based on a target percentage of the unique entries or based on the age (i.e., every entry older than N days).

Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Closes #16277
1 parent 4a4f7b0 commit d4d7945

File tree

21 files changed

+905
-85
lines changed

21 files changed

+905
-85
lines changed

cmd/zdb/zdb.c

Lines changed: 45 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2045,7 +2045,7 @@ dump_all_ddts(spa_t *spa)
20452045

20462046
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
20472047
ddt_t *ddt = spa->spa_ddt[c];
2048-
if (!ddt)
2048+
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
20492049
continue;
20502050
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
20512051
for (ddt_class_t class = 0; class < DDT_CLASSES;
@@ -2072,6 +2072,32 @@ dump_all_ddts(spa_t *spa)
20722072
}
20732073

20742074
dump_dedup_ratio(&dds_total);
2075+
2076+
/*
2077+
* Dump a histogram of unique class entry age
2078+
*/
2079+
if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {
2080+
ddt_age_histo_t histogram;
2081+
2082+
(void) printf("DDT walk unique, building age histogram...\n");
2083+
ddt_prune_walk(spa, 0, &histogram);
2084+
2085+
/*
2086+
* print out histogram for unique entry class birth
2087+
*/
2088+
if (histogram.dah_entries > 0) {
2089+
(void) printf("%5s %9s %4s\n",
2090+
"age", "blocks", "amnt");
2091+
(void) printf("%5s %9s %4s\n",
2092+
"-----", "---------", "----");
2093+
for (int i = 0; i < HIST_BINS; i++) {
2094+
(void) printf("%5d %9d %4d%%\n", 1 << i,
2095+
(int)histogram.dah_age_histo[i],
2096+
(int)((histogram.dah_age_histo[i] * 100) /
2097+
histogram.dah_entries));
2098+
}
2099+
}
2100+
}
20752101
}
20762102

20772103
static void
@@ -5749,12 +5775,17 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
57495775
ddt_entry_t *dde = ddt_lookup(ddt, bp);
57505776

57515777
/*
5752-
* ddt_lookup() can only return NULL if this block didn't exist
5778+
* ddt_lookup() can return NULL if this block didn't exist
57535779
* in the DDT and creating it would take the DDT over its
57545780
* quota. Since we got the block from disk, it must exist in
5755-
* the DDT, so this can't happen.
5781+
* the DDT, so this can't happen. However, when unique entries
5782+
* are pruned, the dedup bit can be set with no corresponding
5783+
* entry in the DDT.
57565784
*/
5757-
VERIFY3P(dde, !=, NULL);
5785+
if (dde == NULL) {
5786+
ddt_exit(ddt);
5787+
goto skipped;
5788+
}
57585789

57595790
/* Get the phys for this variant */
57605791
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
@@ -5774,8 +5805,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
57745805
(void *)(((uintptr_t)dde->dde_io) | (1 << v));
57755806

57765807
/* Consume a reference for this block. */
5777-
VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0);
5778-
ddt_phys_decref(dde->dde_phys, v);
5808+
if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
5809+
ddt_phys_decref(dde->dde_phys, v);
57795810

57805811
/*
57815812
* If this entry has a single flat phys, it may have been
@@ -5864,6 +5895,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
58645895
}
58655896
}
58665897

5898+
skipped:
58675899
for (i = 0; i < 4; i++) {
58685900
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
58695901
int t = (i & 1) ? type : ZDB_OT_TOTAL;
@@ -8138,7 +8170,7 @@ dump_mos_leaks(spa_t *spa)
81388170

81398171
for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
81408172
ddt_t *ddt = spa->spa_ddt[c];
8141-
if (!ddt)
8173+
if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
81428174
continue;
81438175

81448176
/* DDT store objects */
@@ -8150,11 +8182,14 @@ dump_mos_leaks(spa_t *spa)
81508182
}
81518183

81528184
/* FDT container */
8153-
mos_obj_refd(ddt->ddt_dir_object);
8185+
if (ddt->ddt_version == DDT_VERSION_FDT)
8186+
mos_obj_refd(ddt->ddt_dir_object);
81548187

81558188
/* FDT log objects */
8156-
mos_obj_refd(ddt->ddt_log[0].ddl_object);
8157-
mos_obj_refd(ddt->ddt_log[1].ddl_object);
8189+
if (ddt->ddt_flags & DDT_FLAG_LOG) {
8190+
mos_obj_refd(ddt->ddt_log[0].ddl_object);
8191+
mos_obj_refd(ddt->ddt_log[1].ddl_object);
8192+
}
81588193
}
81598194

81608195
if (spa->spa_brt != NULL) {

cmd/zpool/zpool_main.c

Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -130,6 +130,8 @@ static int zpool_do_version(int, char **);
130130

131131
static int zpool_do_wait(int, char **);
132132

133+
static int zpool_do_ddt_prune(int, char **);
134+
133135
static int zpool_do_help(int argc, char **argv);
134136

135137
static zpool_compat_status_t zpool_do_load_compat(
@@ -170,6 +172,7 @@ typedef enum {
170172
HELP_CLEAR,
171173
HELP_CREATE,
172174
HELP_CHECKPOINT,
175+
HELP_DDT_PRUNE,
173176
HELP_DESTROY,
174177
HELP_DETACH,
175178
HELP_EXPORT,
@@ -426,6 +429,8 @@ static zpool_command_t command_table[] = {
426429
{ "sync", zpool_do_sync, HELP_SYNC },
427430
{ NULL },
428431
{ "wait", zpool_do_wait, HELP_WAIT },
432+
{ NULL },
433+
{ "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE },
429434
};
430435

431436
#define NCOMMAND (ARRAY_SIZE(command_table))
@@ -545,6 +550,8 @@ get_usage(zpool_help_t idx)
545550
case HELP_WAIT:
546551
return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
547552
"<pool> [interval]\n"));
553+
case HELP_DDT_PRUNE:
554+
return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
548555
default:
549556
__builtin_unreachable();
550557
}
@@ -13342,6 +13349,88 @@ found:;
1334213349
return (error);
1334313350
}
1334413351

13352+
/*
13353+
* zpool ddtprune -d|-p <amount> <pool>
13354+
*
13355+
* -d <days> Prune entries <days> old and older
13356+
* -p <percent> Prune <percent> amount of entries
13357+
*
13358+
* Prune single reference entries from DDT to satisfy the amount specified.
13359+
*/
13360+
int
13361+
zpool_do_ddt_prune(int argc, char **argv)
13362+
{
13363+
zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
13364+
uint64_t amount = 0;
13365+
zpool_handle_t *zhp;
13366+
char *endptr;
13367+
int c;
13368+
13369+
while ((c = getopt(argc, argv, "d:p:")) != -1) {
13370+
switch (c) {
13371+
case 'd':
13372+
if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
13373+
(void) fprintf(stderr, gettext("-d cannot be "
13374+
"combined with -p option\n"));
13375+
usage(B_FALSE);
13376+
}
13377+
errno = 0;
13378+
amount = strtoull(optarg, &endptr, 0);
13379+
if (errno != 0 || *endptr != '\0' || amount == 0) {
13380+
(void) fprintf(stderr,
13381+
gettext("invalid days value\n"));
13382+
usage(B_FALSE);
13383+
}
13384+
amount *= 86400; /* convert days to seconds */
13385+
unit = ZPOOL_DDT_PRUNE_AGE;
13386+
break;
13387+
case 'p':
13388+
if (unit == ZPOOL_DDT_PRUNE_AGE) {
13389+
(void) fprintf(stderr, gettext("-p cannot be "
13390+
"combined with -d option\n"));
13391+
usage(B_FALSE);
13392+
}
13393+
errno = 0;
13394+
amount = strtoull(optarg, &endptr, 0);
13395+
if (errno != 0 || *endptr != '\0' ||
13396+
amount == 0 || amount > 100) {
13397+
(void) fprintf(stderr,
13398+
gettext("invalid percentage value\n"));
13399+
usage(B_FALSE);
13400+
}
13401+
unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
13402+
break;
13403+
case '?':
13404+
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
13405+
optopt);
13406+
usage(B_FALSE);
13407+
}
13408+
}
13409+
argc -= optind;
13410+
argv += optind;
13411+
13412+
if (unit == ZPOOL_DDT_PRUNE_NONE) {
13413+
(void) fprintf(stderr,
13414+
gettext("missing amount option (-d|-p <value>)\n"));
13415+
usage(B_FALSE);
13416+
} else if (argc < 1) {
13417+
(void) fprintf(stderr, gettext("missing pool argument\n"));
13418+
usage(B_FALSE);
13419+
} else if (argc > 1) {
13420+
(void) fprintf(stderr, gettext("too many arguments\n"));
13421+
usage(B_FALSE);
13422+
}
13423+
zhp = zpool_open(g_zfs, argv[0]);
13424+
if (zhp == NULL)
13425+
return (-1);
13426+
13427+
int error = zpool_ddt_prune(zhp, unit, amount);
13428+
13429+
zpool_close(zhp);
13430+
13431+
return (error);
13432+
}
13433+
1334513434
static int
1334613435
find_command_idx(const char *command, int *idx)
1334713436
{

cmd/ztest.c

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction;
276276
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
277277
extern uint64_t raidz_expand_max_reflow_bytes;
278278
extern uint_t raidz_expand_pause_point;
279+
extern boolean_t ddt_prune_artificial_age;
280+
extern boolean_t ddt_dump_prune_histogram;
279281

280282

281283
static ztest_shared_opts_t *ztest_shared_opts;
@@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher;
446448
ztest_func_t ztest_fletcher_incr;
447449
ztest_func_t ztest_verify_dnode_bt;
448450
ztest_func_t ztest_pool_prefetch_ddt;
451+
ztest_func_t ztest_ddt_prune;
449452

450453
static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
451454
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = {
502505
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
503506
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
504507
ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
508+
ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
505509
};
506510

507511
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -7288,6 +7292,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
72887292
mutex_exit(&ztest_vdev_lock);
72897293
}
72907294

7295+
void
7296+
ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
7297+
{
7298+
(void) zd, (void) id;
7299+
7300+
spa_t *spa = ztest_spa;
7301+
uint64_t pct = ztest_random(15) + 1;
7302+
7303+
(void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct);
7304+
}
7305+
72917306
/*
72927307
* Verify pool integrity by running zdb.
72937308
*/
@@ -7469,6 +7484,13 @@ ztest_resume_thread(void *arg)
74697484
{
74707485
spa_t *spa = arg;
74717486

7487+
/*
7488+
* Synthesize aged DDT entries for ddt prune testing
7489+
*/
7490+
ddt_prune_artificial_age = B_TRUE;
7491+
if (ztest_opts.zo_verbose >= 3)
7492+
ddt_dump_prune_histogram = B_TRUE;
7493+
74727494
while (!ztest_exiting) {
74737495
if (spa_suspended(spa))
74747496
ztest_resume(spa);
@@ -8587,6 +8609,12 @@ ztest_init(ztest_shared_t *zs)
85878609
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
85888610
continue;
85898611

8612+
/*
8613+
* split 50/50 between legacy and fast dedup
8614+
*/
8615+
if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0)
8616+
continue;
8617+
85908618
VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
85918619
spa_feature_table[i].fi_uname));
85928620
fnvlist_add_uint64(props, buf, 0);

contrib/debian/openzfs-zfsutils.install

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8
100100
usr/share/man/man8/zpool-create.8
101101
usr/share/man/man8/zpool-destroy.8
102102
usr/share/man/man8/zpool-detach.8
103+
usr/share/man/man8/zpool-ddtprune.8
103104
usr/share/man/man8/zpool-events.8
104105
usr/share/man/man8/zpool-export.8
105106
usr/share/man/man8/zpool-get.8

include/libzfs.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -305,6 +305,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
305305

306306
_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
307307

308+
_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
309+
uint64_t);
310+
308311
_LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
309312
vdev_state_t *);
310313
_LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);

include/libzfs_core.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
161161

162162
_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
163163

164+
_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
165+
uint64_t);
166+
164167
#ifdef __cplusplus
165168
}
166169
#endif

include/sys/ddt.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -405,6 +405,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
405405

406406
extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
407407

408+
extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
409+
uint64_t amount);
410+
408411
#ifdef __cplusplus
409412
}
410413
#endif

0 commit comments

Comments
 (0)