fat zap should prefetch when iterating

When iterating over a ZAP object, we're almost always certain to iterate
over the entire object. If there are multiple leaf blocks, we can
realize a performance win by issuing reads for all the leaf blocks in
parallel when the iteration begins.

For example, if we have 10,000 snapshots, "zfs destroy -nv
pool/fs@1%9999" can take 30 minutes when the cache is cold. This change
provides a >3x performance improvement, by issuing the reads for all ~64
blocks of each ZAP object in parallel.

Reviewed-by: Andreas Dilger <andreas.dilger@whamcloud.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-58347
Closes #8862
This commit is contained in:
Matthew Ahrens 2019-06-12 13:13:09 -07:00 committed by Tony Hutter
parent 812c36fc71
commit 516a08ebb4
6 changed files with 140 additions and 9 deletions

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc. * Copyright 2017 Nexenta Systems, Inc.
*/ */
@ -350,6 +350,7 @@ typedef struct zap_cursor {
uint64_t zc_serialized; uint64_t zc_serialized;
uint64_t zc_hash; uint64_t zc_hash;
uint32_t zc_cd; uint32_t zc_cd;
boolean_t zc_prefetch;
} zap_cursor_t; } zap_cursor_t;
typedef struct { typedef struct {
@ -375,7 +376,9 @@ typedef struct {
* Initialize a zap cursor, pointing to the "first" attribute of the * Initialize a zap cursor, pointing to the "first" attribute of the
* zapobj. You must _fini the cursor when you are done with it. * zapobj. You must _fini the cursor when you are done with it.
*/ */
void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj);
void zap_cursor_fini(zap_cursor_t *zc); void zap_cursor_fini(zap_cursor_t *zc);
/* /*

View File

@ -104,6 +104,18 @@ to a log2 fraction of the target arc size.
Default value: \fB6\fR. Default value: \fB6\fR.
.RE .RE
.sp
.ne 2
.na
\fBdmu_prefetch_max\fR (int)
.ad
.RS 12n
Limit the amount we can prefetch with one call to this amount (in bytes).
This helps to limit the amount of memory that can be used by prefetching.
.sp
Default value: \fB134,217,728\fR (128MB).
.RE
.sp .sp
.ne 2 .ne 2
.na .na
@ -502,6 +514,19 @@ regular reads (but there's no reason it has to be the same).
Default value: \fB32,768\fR. Default value: \fB32,768\fR.
.RE .RE
.sp
.ne 2
.na
\fBzap_iterate_prefetch\fR (int)
.ad
.RS 12n
If this is set, when we start iterating over a ZAP object, zfs will prefetch
the entire object (all leaf blocks). However, this is limited by
\fBdmu_prefetch_max\fR.
.sp
Use \fB1\fR for on (default) and \fB0\fR for off.
.RE
.sp .sp
.ne 2 .ne 2
.na .na

View File

@ -21,6 +21,7 @@
/* /*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -117,7 +118,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
zap_attribute_t za; zap_attribute_t za;
int error; int error;
if (*walk == 0) {
/*
* We don't want to prefetch the entire ZAP object, because
* it can be enormous. Also the primary use of DDT iteration
* is for scrubbing, in which case we will be issuing many
* scrub I/Os for each ZAP block that we read in, so
* reading the ZAP is unlikely to be the bottleneck.
*/
zap_cursor_init_noprefetch(&zc, os, object);
} else {
zap_cursor_init_serialized(&zc, os, object, *walk); zap_cursor_init_serialized(&zc, os, object, *walk);
}
if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uchar_t cbuf[sizeof (dde->dde_phys) + 1];
uint64_t csize = za.za_num_integers; uint64_t csize = za.za_num_integers;

View File

@ -81,6 +81,13 @@ int zfs_dmu_offset_next_sync = 0;
*/ */
int zfs_object_remap_one_indirect_delay_ms = 0; int zfs_object_remap_one_indirect_delay_ms = 0;
/*
* Limit the amount we can prefetch with one call to this amount. This
* helps to limit the amount of memory that can be used by prefetching.
* Larger objects should be prefetched a bit at a time.
*/
int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
{DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
@ -667,6 +674,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
return; return;
} }
/*
* See comment before the definition of dmu_prefetch_max.
*/
len = MIN(len, dmu_prefetch_max);
/* /*
* XXX - Note, if the dnode for the requested object is not * XXX - Note, if the dnode for the requested object is not
* already cached, we will do a *synchronous* read in the * already cached, we will do a *synchronous* read in the
@ -2629,6 +2641,10 @@ module_param(zfs_dmu_offset_next_sync, int, 0644);
MODULE_PARM_DESC(zfs_dmu_offset_next_sync, MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
"Enable forcing txg sync to find holes"); "Enable forcing txg sync to find holes");
module_param(dmu_prefetch_max, int, 0644);
MODULE_PARM_DESC(dmu_prefetch_max,
"Limit one prefetch call to this size");
/* END CSTYLED */ /* END CSTYLED */
#endif #endif

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */
@ -49,6 +49,36 @@
#include <sys/zap_impl.h> #include <sys/zap_impl.h>
#include <sys/zap_leaf.h> #include <sys/zap_leaf.h>
/*
* If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
* (all leaf blocks) when we start iterating over it.
*
* For zap_cursor_init(), the callers all intend to iterate through all the
* entries. There are a few cases where an error (typically i/o error) could
* cause it to bail out early.
*
* For zap_cursor_init_serialized(), there are callers that do the iteration
* outside of ZFS. Typically they would iterate over everything, but we
* don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
* zcp_snapshots_iter(), and other iterators over things in the MOS - these
* are called by /sbin/zfs and channel programs. The other example is
* zfs_readdir() which iterates over directory entries for the getdents()
* syscall. /sbin/ls iterates to the end (unless it receives a signal), but
* userland doesn't have to.
*
* Given that the ZAP entries aren't returned in a specific order, the only
* legitimate use cases for partial iteration would be:
*
* 1. Pagination: e.g. you only want to display 100 entries at a time, so you
* get the first 100 and then wait for the user to hit "next page", which
* they may never do.
*
* 2. You want to know if there are more than X entries, without relying on
* the zfs-specific implementation of the directory's st_size (which is
* the number of entries).
*/
int zap_iterate_prefetch = B_TRUE;
int fzap_default_block_shift = 14; /* 16k blocksize */ int fzap_default_block_shift = 14; /* 16k blocksize */
extern inline zap_phys_t *zap_f_phys(zap_t *zap); extern inline zap_phys_t *zap_f_phys(zap_t *zap);
@ -1189,6 +1219,21 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
/* retrieve the next entry at or after zc_hash/zc_cd */ /* retrieve the next entry at or after zc_hash/zc_cd */
/* if no entry, return ENOENT */ /* if no entry, return ENOENT */
/*
* If we are reading from the beginning, we're almost certain to
* iterate over the entire ZAP object. If there are multiple leaf
* blocks (freeblk > 2), prefetch the whole object (up to
* dmu_prefetch_max bytes), so that we read the leaf blocks
* concurrently. (Unless noprefetch was requested via
* zap_cursor_init_noprefetch()).
*/
if (zc->zc_hash == 0 && zap_iterate_prefetch &&
zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
ZIO_PRIORITY_ASYNC_READ);
}
if (zc->zc_leaf && if (zc->zc_leaf &&
(ZAP_HASH_IDX(zc->zc_hash, (ZAP_HASH_IDX(zc->zc_hash,
zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
@ -1333,3 +1378,12 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} }
} }
} }
#if defined(_KERNEL)
/* BEGIN CSTYLED */
module_param(zap_iterate_prefetch, int, 0644);
MODULE_PARM_DESC(zap_iterate_prefetch,
"When iterating ZAP object, prefetch it");
/* END CSTYLED */
#endif

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2017 Nexenta Systems, Inc. * Copyright 2017 Nexenta Systems, Inc.
*/ */
@ -1472,9 +1472,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
* Routines for iterating over the attributes. * Routines for iterating over the attributes.
*/ */
void static void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized) uint64_t serialized, boolean_t prefetch)
{ {
zc->zc_objset = os; zc->zc_objset = os;
zc->zc_zap = NULL; zc->zc_zap = NULL;
@ -1483,12 +1483,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
zc->zc_serialized = serialized; zc->zc_serialized = serialized;
zc->zc_hash = 0; zc->zc_hash = 0;
zc->zc_cd = 0; zc->zc_cd = 0;
zc->zc_prefetch = prefetch;
}
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
{
zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
} }
/*
* Initialize a cursor at the beginning of the ZAP object. The entire
* ZAP object will be prefetched.
*/
void void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{ {
zap_cursor_init_serialized(zc, os, zapobj, 0); zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}
/*
* Initialize a cursor at the beginning, but request that we not prefetch
* the entire ZAP object.
*/
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
} }
void void