diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 7a86c56b40..26e9b5eb05 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2909,6 +2909,19 @@ The number of bytes which should be prefetched during a pool traversal Default value: \fB52,428,800\fR. .RE +.sp +.ne 2 +.na +\fBzfs_traverse_indirect_prefetch_limit\fR (int) +.ad +.RS 12n +The number of blocks pointed to by an indirect (non-L0) block which should be +prefetched during a pool traversal (e.g. \fBzfs send\fR or other data +crawling operations). +.sp +Default value: \fB32\fR. +.RE + .sp .ne 2 .na diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 31db49dae6..862c0bf404 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -41,6 +41,7 @@ int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ int32_t send_holes_without_birth_time = 1; +int32_t zfs_traverse_indirect_prefetch_limit = 32; typedef struct prefetch_data { kmutex_t pd_mtx; @@ -176,7 +177,10 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, return (RESUME_SKIP_NONE); } -static void +/* + * Returns B_TRUE if a prefetch read is issued, otherwise B_FALSE. + */ +static boolean_t traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { @@ -184,18 +188,18 @@ traverse_prefetch_metadata(traverse_data_t *td, int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) - return; + return (B_FALSE); /* * If we are in the process of resuming, don't prefetch, because * some children will not be needed (and in fact may have already * been freed).
*/ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) - return; + return (B_FALSE); if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) - return; + return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) - return; + return (B_FALSE); ASSERT(!BP_IS_REDACTED(bp)); if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) @@ -203,6 +207,7 @@ traverse_prefetch_metadata(traverse_data_t *td, (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + return (B_TRUE); } static boolean_t @@ -295,7 +300,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (BP_GET_LEVEL(bp) > 0) { uint32_t flags = ARC_FLAG_WAIT; - int32_t i; + int32_t i, ptidx, pidx; + uint32_t prefetchlimit; int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t *czb; @@ -308,16 +314,46 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); + /* + * When performing a traversal it is beneficial to + * asynchronously read-ahead the upcoming indirect + * blocks since they will be needed shortly. However, + * since a 128k indirect (non-L0) block may contain up + * to 1024 128-byte block pointers, it's preferable to not + * prefetch them all at once. Issuing a large number of + * async reads may affect performance, and the earlier + * the indirect blocks are prefetched the less likely + * they are to still be resident in the ARC when needed. + * Therefore, prefetching indirect blocks is limited to + * zfs_traverse_indirect_prefetch_limit=32 blocks by + * default. + * + * pidx: Index of the next block pointer to prefetch. + * ptidx: Index at which the next prefetch batch is triggered.
+ */ + ptidx = 0; + pidx = 1; + prefetchlimit = zfs_traverse_indirect_prefetch_limit; for (i = 0; i < epb; i++) { - SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - traverse_prefetch_metadata(td, - &((blkptr_t *)buf->b_data)[i], czb); - } + if (prefetchlimit && i == ptidx) { + ASSERT3S(ptidx, <=, pidx); + for (uint32_t prefetched = 0; pidx < epb && + prefetched < prefetchlimit; pidx++) { + SET_BOOKMARK(czb, zb->zb_objset, + zb->zb_object, zb->zb_level - 1, + zb->zb_blkid * epb + pidx); + if (traverse_prefetch_metadata(td, + &((blkptr_t *)buf->b_data)[pidx], + czb) == B_TRUE) { + prefetched++; + if (prefetched == + MAX(prefetchlimit / 2, 1)) + ptidx = pidx; + } + } + } - /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i++) { + /* recursively visitbp() blocks below this */ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); @@ -777,6 +813,9 @@ EXPORT_SYMBOL(traverse_pool); ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW, "Max number of bytes to prefetch"); +ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW, + "Traverse prefetch number of blocks pointed by indirect block"); + #if defined(_KERNEL) module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644); MODULE_PARM_DESC(ignore_hole_birth,