Skip to content

Commit e044ce2

Browse files
gamanakis, skiselkov, lundman
committed
Persistent L2ARC
This commit makes the L2ARC persistent across reboots. We implement a lightweight persistent L2ARC metadata structure that allows L2ARC contents to be recovered after a reboot. This significantly reduces the impact a reboot has on read performance on systems with large caches.

Co-authored-by: Saso Kiselkov <[email protected]>
Co-authored-by: Jorgen Lundman <[email protected]>
Co-authored-by: George Amanakis <[email protected]>
Ported-by: Yuxuan Shui <[email protected]>
Signed-off-by: George Amanakis <[email protected]>
1 parent 0929c4d commit e044ce2

30 files changed

+3020
-88
lines changed

cmd/zdb/zdb.c

Lines changed: 228 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
#include <sys/zio_compress.h>
6363
#include <sys/zfs_fuid.h>
6464
#include <sys/arc.h>
65+
#include <sys/arc_impl.h>
6566
#include <sys/ddt.h>
6667
#include <sys/zfeature.h>
6768
#include <sys/abd.h>
@@ -3474,6 +3475,216 @@ print_label_header(zdb_label_t *label, int l)
34743475
label->header_printed = B_TRUE;
34753476
}
34763477

3478+
/*
 * Print the banner that introduces the L2ARC device header section of the
 * output: a title line framed by two horizontal rules.
 */
static void
print_l2arc_header(void)
{
	static const char rule[] = "------------------------------------\n";

	(void) printf("%s", rule);
	(void) printf("L2ARC device header\n");
	(void) printf("%s", rule);
}
3485+
3486+
/*
 * Print the banner that introduces the L2ARC log block section of the
 * output: a title line framed by two horizontal rules.
 */
static void
print_l2arc_log_blocks(void)
{
	(void) printf(
	    "------------------------------------\n"
	    "L2ARC device log blocks\n"
	    "------------------------------------\n");
}
3493+
3494+
static void
3495+
dump_l2arc_log_entries(uint64_t log_entries,
3496+
l2arc_log_ent_phys_t *le, int i)
3497+
{
3498+
for (int j = 0; j < log_entries; j++) {
3499+
dva_t dva = le[j].le_dva;
3500+
(void) printf("lb[%4d]\tle[%4d]\tDVA asize: %llu,"
3501+
"vdev: %llu, offset: %llu\n", i + 1, j + 1,
3502+
(u_longlong_t)DVA_GET_ASIZE(&dva),
3503+
(u_longlong_t)DVA_GET_VDEV(&dva),
3504+
(u_longlong_t)DVA_GET_OFFSET(&dva));
3505+
(void) printf("|\t\t\t\tbirth: %llu\n",
3506+
(u_longlong_t)le[j].le_birth);
3507+
(void) printf("|\t\t\t\tlsize: %llu\n",
3508+
(u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
3509+
(void) printf("|\t\t\t\tpsize: %llu\n",
3510+
(u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
3511+
(void) printf("|\t\t\t\tcompr: %llu\n",
3512+
(u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
3513+
(void) printf("|\t\t\t\ttype: %llu\n",
3514+
(u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
3515+
(void) printf("|\t\t\t\tprotected: %llu\n",
3516+
(u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
3517+
(void) printf("|\t\t\t\tprefetch: %llu\n",
3518+
(u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
3519+
(void) printf("|\t\t\t\taddress: %llu\n",
3520+
(u_longlong_t)le[j].le_daddr);
3521+
(void) printf("|\n");
3522+
}
3523+
(void) printf("\n");
3524+
}
3525+
3526+
static void
3527+
dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
3528+
{
3529+
(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr);
3530+
(void) printf("|\t\tpayload_asize: %llu\n",
3531+
(u_longlong_t)lbps.lbp_payload_asize);
3532+
(void) printf("|\t\tpayload_start: %llu\n",
3533+
(u_longlong_t)lbps.lbp_payload_start);
3534+
(void) printf("|\t\tlsize: %llu\n",
3535+
(u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop));
3536+
(void) printf("|\t\tpsize: %llu\n",
3537+
(u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop));
3538+
(void) printf("|\t\tcompralgo: %llu\n",
3539+
(u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop));
3540+
(void) printf("|\t\tcksumalgo: %llu\n",
3541+
(u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop));
3542+
(void) printf("|\n\n");
3543+
}
3544+
3545+
static void
3546+
dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr)
3547+
{
3548+
l2arc_log_blk_phys_t this_lb;
3549+
uint64_t psize;
3550+
l2arc_log_blkptr_t lbps[2];
3551+
abd_t *abd;
3552+
zio_cksum_t cksum;
3553+
int i = 0, failed = 0;
3554+
l2arc_dev_t dev;
3555+
3556+
print_l2arc_log_blocks();
3557+
bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps));
3558+
3559+
dev.l2ad_evict = l2dhdr.dh_evict;
3560+
dev.l2ad_start = l2dhdr.dh_start;
3561+
dev.l2ad_end = l2dhdr.dh_end;
3562+
3563+
if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) {
3564+
/* no log blocks to read */
3565+
(void) printf("No log blocks to read\n");
3566+
(void) printf("\n");
3567+
return;
3568+
} else {
3569+
dev.l2ad_hand = lbps[0].lbp_daddr +
3570+
L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
3571+
}
3572+
3573+
dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
3574+
3575+
for (;;) {
3576+
if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
3577+
break;
3578+
3579+
psize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
3580+
if (pread64(fd, &this_lb, psize, lbps[0].lbp_daddr) != psize) {
3581+
(void) printf("Error while reading next log block\n\n");
3582+
break;
3583+
}
3584+
3585+
fletcher_4_native_varsize(&this_lb, psize, &cksum);
3586+
if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
3587+
failed++;
3588+
(void) printf("Invalid cksum\n");
3589+
dump_l2arc_log_blkptr(lbps[0]);
3590+
break;
3591+
}
3592+
3593+
switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
3594+
case ZIO_COMPRESS_OFF:
3595+
break;
3596+
case ZIO_COMPRESS_LZ4:
3597+
abd = abd_alloc_for_io(psize, B_TRUE);
3598+
abd_copy_from_buf_off(abd, &this_lb, 0, psize);
3599+
zio_decompress_data(L2BLK_GET_COMPRESS(
3600+
(&lbps[0])->lbp_prop), abd, &this_lb,
3601+
psize, sizeof (this_lb));
3602+
abd_free(abd);
3603+
break;
3604+
default:
3605+
break;
3606+
}
3607+
3608+
if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
3609+
byteswap_uint64_array(&this_lb, psize);
3610+
3611+
if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
3612+
(void) printf("Invalid log block magic\n\n");
3613+
break;
3614+
}
3615+
3616+
i++;
3617+
if (dump_opt['l'] > 1) {
3618+
(void) printf("lb[%4d]\tmagic: %llu\n", i,
3619+
(u_longlong_t)this_lb.lb_magic);
3620+
dump_l2arc_log_blkptr(lbps[0]);
3621+
}
3622+
3623+
if (dump_opt['l'] > 2)
3624+
dump_l2arc_log_entries(l2dhdr.dh_log_blk_ent,
3625+
this_lb.lb_entries, i);
3626+
3627+
if (l2arc_range_check_overlap(lbps[1].lbp_daddr,
3628+
lbps[0].lbp_daddr, dev.l2ad_evict) && !dev.l2ad_first)
3629+
break;
3630+
3631+
lbps[0] = lbps[1];
3632+
lbps[1] = this_lb.lb_prev_lbp;
3633+
}
3634+
3635+
(void) printf("log_blk_count:\t %d with valid cksum\n", i);
3636+
(void) printf("\t\t %d with invalid cksum\n\n", failed);
3637+
}
3638+
3639+
static void
3640+
dump_l2arc_header(int fd)
3641+
{
3642+
l2arc_dev_hdr_phys_t l2dhdr;
3643+
int error = B_FALSE;
3644+
3645+
if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
3646+
VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
3647+
error = B_TRUE;
3648+
} else {
3649+
if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
3650+
byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
3651+
3652+
if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
3653+
error = B_TRUE;
3654+
}
3655+
3656+
if (error) {
3657+
(void) printf("L2ARC device header not found\n\n");
3658+
} else if (!dump_opt['q']) {
3659+
print_l2arc_header();
3660+
3661+
(void) printf(" magic: %llu\n",
3662+
(u_longlong_t)l2dhdr.dh_magic);
3663+
(void) printf(" version: %llu\n",
3664+
(u_longlong_t)l2dhdr.dh_version);
3665+
(void) printf(" pool_guid: %llu\n",
3666+
(u_longlong_t)l2dhdr.dh_spa_guid);
3667+
(void) printf(" flags: %llu\n",
3668+
(u_longlong_t)l2dhdr.dh_flags);
3669+
(void) printf(" start_lbps[0]: %llu\n",
3670+
(u_longlong_t)
3671+
l2dhdr.dh_start_lbps[0].lbp_daddr);
3672+
(void) printf(" start_lbps[1]: %llu\n",
3673+
(u_longlong_t)
3674+
l2dhdr.dh_start_lbps[1].lbp_daddr);
3675+
(void) printf(" log_blk_ent: %llu\n",
3676+
(u_longlong_t)l2dhdr.dh_log_blk_ent);
3677+
(void) printf(" start: %llu\n",
3678+
(u_longlong_t)l2dhdr.dh_start);
3679+
(void) printf(" end: %llu\n",
3680+
(u_longlong_t)l2dhdr.dh_end);
3681+
(void) printf(" evict: %llu\n\n",
3682+
(u_longlong_t)l2dhdr.dh_evict);
3683+
3684+
dump_l2arc_log_blocks(fd, l2dhdr);
3685+
}
3686+
}
3687+
34773688
static void
34783689
dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
34793690
{
@@ -3639,10 +3850,11 @@ dump_label(const char *dev)
36393850
{
36403851
char path[MAXPATHLEN];
36413852
zdb_label_t labels[VDEV_LABELS];
3642-
uint64_t psize, ashift;
3853+
uint64_t psize, ashift, l2cache;
36433854
struct stat64 statbuf;
36443855
boolean_t config_found = B_FALSE;
36453856
boolean_t error = B_FALSE;
3857+
boolean_t read_l2arc_header = B_FALSE;
36463858
avl_tree_t config_tree;
36473859
avl_tree_t uberblock_tree;
36483860
void *node, *cookie;
@@ -3735,6 +3947,15 @@ dump_label(const char *dev)
37353947
if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
37363948
size = buflen;
37373949

3950+
/* If the device is a cache device, remember to read its L2ARC header. */
3951+
if (!read_l2arc_header) {
3952+
if (nvlist_lookup_uint64(config,
3953+
ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
3954+
l2cache == POOL_STATE_L2CACHE) {
3955+
read_l2arc_header = B_TRUE;
3956+
}
3957+
}
3958+
37383959
fletcher_4_native_varsize(buf, size, &cksum);
37393960
rec = cksum_record_insert(&config_tree, &cksum, l);
37403961

@@ -3785,6 +4006,12 @@ dump_label(const char *dev)
37854006
nvlist_free(label->config_nv);
37864007
}
37874008

4009+
/*
4010+
* Dump the L2ARC header, if existent.
4011+
*/
4012+
if (read_l2arc_header)
4013+
dump_l2arc_header(fd);
4014+
37884015
cookie = NULL;
37894016
while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
37904017
umem_free(node, sizeof (cksum_record_t));

configure.ac

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,7 @@ AC_CONFIG_FILES([
335335
tests/zfs-tests/tests/functional/no_space/Makefile
336336
tests/zfs-tests/tests/functional/nopwrite/Makefile
337337
tests/zfs-tests/tests/functional/online_offline/Makefile
338+
tests/zfs-tests/tests/functional/persist_l2arc/Makefile
338339
tests/zfs-tests/tests/functional/pool_checkpoint/Makefile
339340
tests/zfs-tests/tests/functional/pool_names/Makefile
340341
tests/zfs-tests/tests/functional/poolversion/Makefile

include/sys/arc.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,10 +310,14 @@ void arc_fini(void);
310310
void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
311311
void l2arc_remove_vdev(vdev_t *vd);
312312
boolean_t l2arc_vdev_present(vdev_t *vd);
313+
void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
314+
boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top,
315+
uint64_t check);
313316
void l2arc_init(void);
314317
void l2arc_fini(void);
315318
void l2arc_start(void);
316319
void l2arc_stop(void);
320+
void l2arc_spa_rebuild_start(spa_t *spa);
317321

318322
#ifndef _KERNEL
319323
extern boolean_t arc_watch;

0 commit comments

Comments
 (0)