fat zap should prefetch when iterating
When iterating over a ZAP object, we're almost always certain to iterate over the entire object. If there are multiple leaf blocks, we can realize a performance win by issuing reads for all the leaf blocks in parallel when the iteration begins. For example, if we have 10,000 snapshots, "zfs destroy -nv pool/fs@1%9999" can take 30 minutes when the cache is cold. This change provides a >3x performance improvement, by issuing the reads for all ~64 blocks of each ZAP object in parallel. Reviewed-by: Andreas Dilger <andreas.dilger@whamcloud.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> External-issue: DLPX-58347 Closes #8862
This commit is contained in:
parent
d9cd66e45f
commit
d9b4bf0665
|
@ -21,7 +21,7 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||||
* Copyright 2017 Nexenta Systems, Inc.
|
* Copyright 2017 Nexenta Systems, Inc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -350,6 +350,7 @@ typedef struct zap_cursor {
|
||||||
uint64_t zc_serialized;
|
uint64_t zc_serialized;
|
||||||
uint64_t zc_hash;
|
uint64_t zc_hash;
|
||||||
uint32_t zc_cd;
|
uint32_t zc_cd;
|
||||||
|
boolean_t zc_prefetch;
|
||||||
} zap_cursor_t;
|
} zap_cursor_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -375,7 +376,9 @@ typedef struct {
|
||||||
* Initialize a zap cursor, pointing to the "first" attribute of the
|
* Initialize a zap cursor, pointing to the "first" attribute of the
|
||||||
* zapobj. You must _fini the cursor when you are done with it.
|
* zapobj. You must _fini the cursor when you are done with it.
|
||||||
*/
|
*/
|
||||||
void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
|
void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
|
||||||
|
void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
|
||||||
|
uint64_t zapobj);
|
||||||
void zap_cursor_fini(zap_cursor_t *zc);
|
void zap_cursor_fini(zap_cursor_t *zc);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -104,6 +104,18 @@ to a log2 fraction of the target arc size.
|
||||||
Default value: \fB6\fR.
|
Default value: \fB6\fR.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBdmu_prefetch_max\fR (int)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
Limit the amount of data that can be prefetched by a single call to this value (in bytes).
|
||||||
|
This helps to limit the amount of memory that can be used by prefetching.
|
||||||
|
.sp
|
||||||
|
Default value: \fB134,217,728\fR (128MB).
|
||||||
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
|
@ -502,6 +514,19 @@ regular reads (but there's no reason it has to be the same).
|
||||||
Default value: \fB32,768\fR.
|
Default value: \fB32,768\fR.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBzap_iterate_prefetch\fR (int)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
If this is set, when we start iterating over a ZAP object, zfs will prefetch
|
||||||
|
the entire object (all leaf blocks). However, this is limited by
|
||||||
|
\fBdmu_prefetch_max\fR.
|
||||||
|
.sp
|
||||||
|
Use \fB1\fR for on (default) and \fB0\fR for off.
|
||||||
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* Copyright (c) 2018 by Delphix. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sys/zfs_context.h>
|
#include <sys/zfs_context.h>
|
||||||
|
@ -117,7 +118,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
|
||||||
zap_attribute_t za;
|
zap_attribute_t za;
|
||||||
int error;
|
int error;
|
||||||
|
|
||||||
zap_cursor_init_serialized(&zc, os, object, *walk);
|
if (*walk == 0) {
|
||||||
|
/*
|
||||||
|
* We don't want to prefetch the entire ZAP object, because
|
||||||
|
* it can be enormous. Also the primary use of DDT iteration
|
||||||
|
* is for scrubbing, in which case we will be issuing many
|
||||||
|
* scrub I/Os for each ZAP block that we read in, so
|
||||||
|
* reading the ZAP is unlikely to be the bottleneck.
|
||||||
|
*/
|
||||||
|
zap_cursor_init_noprefetch(&zc, os, object);
|
||||||
|
} else {
|
||||||
|
zap_cursor_init_serialized(&zc, os, object, *walk);
|
||||||
|
}
|
||||||
if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
|
if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
|
||||||
uchar_t cbuf[sizeof (dde->dde_phys) + 1];
|
uchar_t cbuf[sizeof (dde->dde_phys) + 1];
|
||||||
uint64_t csize = za.za_num_integers;
|
uint64_t csize = za.za_num_integers;
|
||||||
|
|
|
@ -81,6 +81,13 @@ int zfs_dmu_offset_next_sync = 0;
|
||||||
*/
|
*/
|
||||||
int zfs_object_remap_one_indirect_delay_ms = 0;
|
int zfs_object_remap_one_indirect_delay_ms = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Limit the amount we can prefetch with one call to this amount. This
|
||||||
|
* helps to limit the amount of memory that can be used by prefetching.
|
||||||
|
* Larger objects should be prefetched a bit at a time.
|
||||||
|
*/
|
||||||
|
int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
|
||||||
|
|
||||||
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
|
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
|
||||||
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
|
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
|
||||||
{DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
|
{DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
|
||||||
|
@ -667,6 +674,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* See comment before the definition of dmu_prefetch_max.
|
||||||
|
*/
|
||||||
|
len = MIN(len, dmu_prefetch_max);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXX - Note, if the dnode for the requested object is not
|
* XXX - Note, if the dnode for the requested object is not
|
||||||
* already cached, we will do a *synchronous* read in the
|
* already cached, we will do a *synchronous* read in the
|
||||||
|
@ -2629,6 +2641,10 @@ module_param(zfs_dmu_offset_next_sync, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
|
MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
|
||||||
"Enable forcing txg sync to find holes");
|
"Enable forcing txg sync to find holes");
|
||||||
|
|
||||||
|
module_param(dmu_prefetch_max, int, 0644);
|
||||||
|
MODULE_PARM_DESC(dmu_prefetch_max,
|
||||||
|
"Limit one prefetch call to this size");
|
||||||
|
|
||||||
/* END CSTYLED */
|
/* END CSTYLED */
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -20,7 +20,7 @@
|
||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -49,6 +49,36 @@
|
||||||
#include <sys/zap_impl.h>
|
#include <sys/zap_impl.h>
|
||||||
#include <sys/zap_leaf.h>
|
#include <sys/zap_leaf.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
|
||||||
|
* (all leaf blocks) when we start iterating over it.
|
||||||
|
*
|
||||||
|
* For zap_cursor_init(), the callers all intend to iterate through all the
|
||||||
|
* entries. There are a few cases where an error (typically i/o error) could
|
||||||
|
* cause it to bail out early.
|
||||||
|
*
|
||||||
|
* For zap_cursor_init_serialized(), there are callers that do the iteration
|
||||||
|
* outside of ZFS. Typically they would iterate over everything, but we
|
||||||
|
* don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
|
||||||
|
* zcp_snapshots_iter(), and other iterators over things in the MOS - these
|
||||||
|
* are called by /sbin/zfs and channel programs. The other example is
|
||||||
|
* zfs_readdir() which iterates over directory entries for the getdents()
|
||||||
|
* syscall. /sbin/ls iterates to the end (unless it receives a signal), but
|
||||||
|
* userland doesn't have to.
|
||||||
|
*
|
||||||
|
* Given that the ZAP entries aren't returned in a specific order, the only
|
||||||
|
* legitimate use cases for partial iteration would be:
|
||||||
|
*
|
||||||
|
* 1. Pagination: e.g. you only want to display 100 entries at a time, so you
|
||||||
|
* get the first 100 and then wait for the user to hit "next page", which
|
||||||
|
* they may never do.
|
||||||
|
*
|
||||||
|
* 2. You want to know if there are more than X entries, without relying on
|
||||||
|
* the zfs-specific implementation of the directory's st_size (which is
|
||||||
|
* the number of entries).
|
||||||
|
*/
|
||||||
|
int zap_iterate_prefetch = B_TRUE;
|
||||||
|
|
||||||
int fzap_default_block_shift = 14; /* 16k blocksize */
|
int fzap_default_block_shift = 14; /* 16k blocksize */
|
||||||
|
|
||||||
extern inline zap_phys_t *zap_f_phys(zap_t *zap);
|
extern inline zap_phys_t *zap_f_phys(zap_t *zap);
|
||||||
|
@ -1189,6 +1219,21 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
|
||||||
/* retrieve the next entry at or after zc_hash/zc_cd */
|
/* retrieve the next entry at or after zc_hash/zc_cd */
|
||||||
/* if no entry, return ENOENT */
|
/* if no entry, return ENOENT */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are reading from the beginning, we're almost certain to
|
||||||
|
* iterate over the entire ZAP object. If there are multiple leaf
|
||||||
|
* blocks (freeblk > 2), prefetch the whole object (up to
|
||||||
|
* dmu_prefetch_max bytes), so that we read the leaf blocks
|
||||||
|
* concurrently. (Unless noprefetch was requested via
|
||||||
|
* zap_cursor_init_noprefetch()).
|
||||||
|
*/
|
||||||
|
if (zc->zc_hash == 0 && zap_iterate_prefetch &&
|
||||||
|
zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
|
||||||
|
dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
|
||||||
|
zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
|
||||||
|
ZIO_PRIORITY_ASYNC_READ);
|
||||||
|
}
|
||||||
|
|
||||||
if (zc->zc_leaf &&
|
if (zc->zc_leaf &&
|
||||||
(ZAP_HASH_IDX(zc->zc_hash,
|
(ZAP_HASH_IDX(zc->zc_hash,
|
||||||
zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
|
zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
|
||||||
|
@ -1333,3 +1378,12 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(_KERNEL)
|
||||||
|
/* BEGIN CSTYLED */
|
||||||
|
module_param(zap_iterate_prefetch, int, 0644);
|
||||||
|
MODULE_PARM_DESC(zap_iterate_prefetch,
|
||||||
|
"When iterating ZAP object, prefetch it");
|
||||||
|
|
||||||
|
/* END CSTYLED */
|
||||||
|
#endif
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||||
* Copyright 2017 Nexenta Systems, Inc.
|
* Copyright 2017 Nexenta Systems, Inc.
|
||||||
*/
|
*/
|
||||||
|
@ -1472,9 +1472,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
||||||
* Routines for iterating over the attributes.
|
* Routines for iterating over the attributes.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
void
|
static void
|
||||||
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
|
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
|
||||||
uint64_t serialized)
|
uint64_t serialized, boolean_t prefetch)
|
||||||
{
|
{
|
||||||
zc->zc_objset = os;
|
zc->zc_objset = os;
|
||||||
zc->zc_zap = NULL;
|
zc->zc_zap = NULL;
|
||||||
|
@ -1483,12 +1483,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
|
||||||
zc->zc_serialized = serialized;
|
zc->zc_serialized = serialized;
|
||||||
zc->zc_hash = 0;
|
zc->zc_hash = 0;
|
||||||
zc->zc_cd = 0;
|
zc->zc_cd = 0;
|
||||||
|
zc->zc_prefetch = prefetch;
|
||||||
|
}
|
||||||
|
void
|
||||||
|
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
|
||||||
|
uint64_t serialized)
|
||||||
|
{
|
||||||
|
zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize a cursor at the beginning of the ZAP object. The entire
|
||||||
|
* ZAP object will be prefetched (see zap_iterate_prefetch, dmu_prefetch_max).
|
||||||
|
*/
|
||||||
void
|
void
|
||||||
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
|
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
|
||||||
{
|
{
|
||||||
zap_cursor_init_serialized(zc, os, zapobj, 0);
|
zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize a cursor at the beginning, but request that we not prefetch
|
||||||
|
* the entire ZAP object.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
|
||||||
|
{
|
||||||
|
zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
Loading…
Reference in New Issue