From fdd97e00934b9a1af7f953333ddf4f7c196907f0 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 7 Aug 2023 16:54:41 -0400 Subject: [PATCH] Refactor dmu_prefetch(). - Split dmu_prefetch_dnode() from dmu_prefetch() into a separate function. It is quite inconvenient to read the code where len = 0 means dnode prefetch instead indirect/data prefetch. One function doing both has no benefits, since the code paths are independent. - Improve dmu_prefetch() handling of long block ranges. Instead of limiting L0 data length to prefetch for to dmu_prefetch_max, make dmu_prefetch_max limit the actual amount of prefetch at the specified level, and, if there is more, prefetch all the rest at higher indirection level. It should improve random access times within the prefetched range of any length, reducing importance of specific dmu_prefetch_max value. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15076 --- include/sys/dmu.h | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 4 +- module/os/linux/zfs/zfs_vnops_os.c | 7 +- module/zfs/dmu.c | 103 ++++++++++++++++----------- module/zfs/dsl_deadlist.c | 8 +-- module/zfs/spa_log_spacemap.c | 4 +- module/zfs/zvol.c | 2 +- 7 files changed, 72 insertions(+), 57 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 06b4dc27df..5bdb7c0293 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -889,6 +889,7 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 05f28033be..1ba25bce61 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, ASSERT3S(outcount, <=, bufsize); - /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 7c473bc7e9..be528f6e81 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (done) break; - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } + if (prefetch) + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3215ab1c2a..d82211e6d4 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -695,74 +695,93 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max. It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; + int64_t level2 = level; + uint64_t start, end, start2, end2; - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); - - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } +/* + * Issue prefetch I/Os for the given object's dnode. + */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); +} + /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index ac30a37081..e6c8d4be13 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 2878e68c6e..cf05158b63 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa) /* Prefetch log spacemaps dnodes. */ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); } uint_t pn = 0; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 20ea71f233..89b523ddd9 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -931,7 +931,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } }