Refactor dmu_prefetch().
- Split dmu_prefetch_dnode() from dmu_prefetch() into a separate function.
  It is quite inconvenient to read code where len = 0 means a dnode prefetch
  rather than an indirect/data prefetch.  One function doing both has no
  benefit, since the two code paths are independent.

- Improve dmu_prefetch() handling of long block ranges.  Instead of capping
  the L0 data length to prefetch at dmu_prefetch_max, make dmu_prefetch_max
  limit the amount actually prefetched at the specified level and, if there
  is more, prefetch the rest at a higher indirection level.  This should
  improve random access times within a prefetched range of any length,
  making the exact dmu_prefetch_max value less important.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15076
parent 3b8817db96
commit fdd97e0093
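
To make the second bullet concrete, below is a standalone sketch of the range
split (not ZFS code: the 8 MiB dmu_prefetch_max, the 128 KiB block shifts, and
the 1 GiB request are assumed example numbers; only SPA_BLKPTRSHIFT = 7, i.e.
128-byte block pointers, and the round-up arithmetic mirror the dmu.c change
below).

	#include <stdint.h>
	#include <stdio.h>

	#define SPA_BLKPTRSHIFT	7	/* log2(sizeof (blkptr_t)) */

	int
	main(void)
	{
		uint64_t pf_max = 8ULL << 20;	/* assumed dmu_prefetch_max */
		uint8_t dbs = 17, ibs = 17;	/* 128 KiB data/indirect blocks */
		uint64_t offset = 0, len = 1ULL << 30;	/* 1 GiB request */

		/* L0 blocks covering [offset, offset + len). */
		uint64_t start = offset >> dbs;
		uint64_t end2 = ((offset + len - 1) >> dbs) + 1;

		/* Part one: as many L0 blocks as fit into pf_max. */
		uint64_t limit = pf_max >> dbs;
		uint64_t end = (start + limit < end2) ? start + limit : end2;

		/* Part two: climb levels until the rest fits into pf_max. */
		uint64_t start2 = end;
		int level2 = 0;
		uint8_t ibps = ibs - SPA_BLKPTRSHIFT; /* log2(ptrs/indirect) */
		limit = pf_max >> ibs;
		do {
			level2++;
			/* Round up, as P2ROUNDUP does in the real code. */
			start2 = (start2 + (1ULL << ibps) - 1) >> ibps;
			end2 = (end2 + (1ULL << ibps) - 1) >> ibps;
		} while (end2 - start2 > limit);

		/* Prints: L0 [0, 64), L1 [1, 8) */
		printf("L0 [%ju, %ju), L%d [%ju, %ju)\n", (uintmax_t)start,
		    (uintmax_t)end, level2, (uintmax_t)start2,
		    (uintmax_t)end2);
		return (0);
	}

So a 1 GiB request costs 64 L0 prefetches plus 7 L1 prefetches instead of
8192 L0 prefetches.  start2 is rounded up because prefetching L0 blocks
[0, 64) already pulls in the indirect block above them.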
include/sys/dmu.h

@@ -889,6 +889,7 @@ extern uint_t zfs_max_recordsize;
  */
 void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, enum zio_priority pri);
+void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
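
Every call-site hunk below follows the same one-line conversion.
Schematically (a fragment, not a complete function; "os" and "obj" stand for
whatever objset and object number each real call site uses):

	/* Before: len = 0 overloaded dmu_prefetch() to mean "prefetch the
	 * dnode itself", not "prefetch zero bytes of level-0 data". */
	dmu_prefetch(os, obj, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);

	/* After: the dnode case has an explicit entry point. */
	dmu_prefetch_dnode(os, obj, ZIO_PRIORITY_SYNC_READ);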
module/os/freebsd/zfs/zfs_vnops_os.c

@@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
 		ASSERT3S(outcount, <=, bufsize);
 
-		/* Prefetch znode */
-		if (prefetch)
-			dmu_prefetch(os, objnum, 0, 0, 0,
-			    ZIO_PRIORITY_SYNC_READ);
+		if (prefetch)
+			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
module/os/linux/zfs/zfs_vnops_os.c

@@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
 		if (done)
 			break;
 
-		/* Prefetch znode */
-		if (prefetch) {
-			dmu_prefetch(os, objnum, 0, 0, 0,
-			    ZIO_PRIORITY_SYNC_READ);
-		}
+		if (prefetch)
+			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
module/zfs/dmu.c
@@ -695,74 +695,93 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
 }
 
 /*
- * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
+ * Issue prefetch I/Os for the given blocks.  If level is greater than 0, the
  * indirect blocks prefetched will be those that point to the blocks containing
- * the data starting at offset, and continuing to offset + len.
+ * the data starting at offset, and continuing to offset + len.  If the range
+ * is too long, prefetch the first dmu_prefetch_max bytes as requested, while
+ * for the rest only a higher level, also fitting within dmu_prefetch_max.  It
+ * should primarily help random reads, since for long sequential reads there is
+ * a speculative prefetcher.
  *
  * Note that if the indirect blocks above the blocks being prefetched are not
- * in cache, they will be asynchronously read in.
+ * in cache, they will be asynchronously read in.  Dnode read by dnode_hold()
+ * is currently synchronous.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
-	uint64_t blkid;
-	int nblks, err;
+	int64_t level2 = level;
+	uint64_t start, end, start2, end2;
 
-	if (len == 0) { /* they're interested in the bonus buffer */
-		dn = DMU_META_DNODE(os);
-
-		if (object == 0 || object >= DN_MAX_OBJECT)
-			return;
-
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		blkid = dbuf_whichblock(dn, level,
-		    object * sizeof (dnode_phys_t));
-		dbuf_prefetch(dn, level, blkid, pri, 0);
-		rw_exit(&dn->dn_struct_rwlock);
+	if (dmu_prefetch_max == 0 || len == 0) {
+		dmu_prefetch_dnode(os, object, pri);
 		return;
 	}
 
-	/*
-	 * See comment before the definition of dmu_prefetch_max.
-	 */
-	len = MIN(len, dmu_prefetch_max);
-
-	/*
-	 * XXX - Note, if the dnode for the requested object is not
-	 * already cached, we will do a *synchronous* read in the
-	 * dnode_hold() call.  The same is true for any indirects.
-	 */
-	err = dnode_hold(os, object, FTAG, &dn);
-	if (err != 0)
+	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return;
 
 	/*
-	 * offset + len - 1 is the last byte we want to prefetch for, and offset
-	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
-	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
-	 * offset) is the first.  Then the number we need to prefetch is the
-	 * last - first + 1.
+	 * Depending on len we may do two prefetches: blocks [start, end) at
+	 * level, and following blocks [start2, end2) at higher level2.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	if (level > 0 || dn->dn_datablkshift != 0) {
-		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
-		    dbuf_whichblock(dn, level, offset) + 1;
+	if (dn->dn_datablkshift != 0) {
+		/*
+		 * The object has multiple blocks.  Calculate the full range
+		 * of blocks [start, end2) and then split it into two parts,
+		 * so that the first [start, end) fits into dmu_prefetch_max.
+		 */
+		start = dbuf_whichblock(dn, level, offset);
+		end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
+		uint8_t ibs = dn->dn_indblkshift;
+		uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
+		uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
+		start2 = end = MIN(end2, start + limit);
+
+		/*
+		 * Find level2 where [start2, end2) fits into dmu_prefetch_max.
+		 */
+		uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
+		limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
+		do {
+			level2++;
+			start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
+			end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
+		} while (end2 - start2 > limit);
 	} else {
-		nblks = (offset < dn->dn_datablksz);
+		/* There is only one block.  Prefetch it or nothing. */
+		start = start2 = end2 = 0;
+		end = start + (level == 0 && offset < dn->dn_datablksz);
 	}
 
-	if (nblks != 0) {
-		blkid = dbuf_whichblock(dn, level, offset);
-		for (int i = 0; i < nblks; i++)
-			dbuf_prefetch(dn, level, blkid + i, pri, 0);
-	}
+	for (uint64_t i = start; i < end; i++)
+		dbuf_prefetch(dn, level, i, pri, 0);
+	for (uint64_t i = start2; i < end2; i++)
+		dbuf_prefetch(dn, level2, i, pri, 0);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 }
 
+/*
+ * Issue prefetch I/Os for the given object's dnode.
+ */
+void
+dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
+{
+	if (object == 0 || object >= DN_MAX_OBJECT)
+		return;
+
+	dnode_t *dn = DMU_META_DNODE(os);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t));
+	dbuf_prefetch(dn, 0, blkid, pri, 0);
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crashes in the
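
A note on the arithmetic above: P2ROUNDUP(x, align) is the kernel's
power-of-two round-up macro, so P2ROUNDUP(size, 1 << bs) >> bs is a ceiling
division that turns a byte limit into a block-count limit, and each pass of
the do/while divides block numbers by the number of block pointers per
indirect block, 2^(ibs - SPA_BLKPTRSHIFT), since a blkptr_t is 128 bytes.
A minimal standalone check (macro body as in sys/sysmacros.h; the sample
numbers are assumptions):

	#include <assert.h>
	#include <stdint.h>

	/* Power-of-two round-up, as defined in sys/sysmacros.h. */
	#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))

	int
	main(void)
	{
		/* Byte limit -> block count is a ceiling division: 8 MiB plus
		 * one byte of 128 KiB blocks (bs = 17) needs 65 blocks. */
		assert((P2ROUNDUP((8ULL << 20) + 1, 1ULL << 17) >> 17) == 65);

		/* One level up: 8192 L0 blocks shrink to 8 L1 blocks, since a
		 * 128 KiB indirect holds 2^(17 - 7) = 1024 block pointers. */
		assert((P2ROUNDUP(8192ULL, 1ULL << 10) >> 10) == 8);
		return (0);
	}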
module/zfs/dsl_deadlist.c

@@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
 		 * in parallel. Then open them all in a second pass.
 		 */
 		dle->dle_bpobj.bpo_object = za.za_first_integer;
-		dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
-		    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+		dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
+		    ZIO_PRIORITY_SYNC_READ);
 		avl_add(&dl->dl_tree, dle);
 	}
 
@@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl)
 		 * in parallel. Then open them all in a second pass.
 		 */
 		dlce->dlce_bpobj = za.za_first_integer;
-		dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
-		    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+		dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
+		    ZIO_PRIORITY_SYNC_READ);
 		avl_add(&dl->dl_cache, dlce);
 	}
 	VERIFY3U(error, ==, ENOENT);
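
Both deadlist hunks lean on the two-pass pattern the surrounding comments
describe: issue cheap asynchronous dnode prefetches for every bpobj first,
then do the synchronous opens, which find the dnodes already cached.  A
schematic of that pattern (hypothetical fragment; bpobj_open() is the real
second-pass call, and objs[]/n stand for object numbers gathered from the
ZAP):

	/* Pass 1: queue async dnode prefetches for all objects up front. */
	for (int i = 0; i < n; i++)
		dmu_prefetch_dnode(os, objs[i], ZIO_PRIORITY_SYNC_READ);

	/* Pass 2: synchronous opens now mostly hit cached dnodes. */
	for (int i = 0; i < n; i++)
		VERIFY0(bpobj_open(&bpo[i], os, objs[i]));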
module/zfs/spa_log_spacemap.c

@@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa)
 	/* Prefetch log spacemaps dnodes. */
 	for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
 	    sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
-		dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
-		    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+		dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
+		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	uint_t pn = 0;
module/zfs/zvol.c

@@ -931,7 +931,7 @@ zvol_prefetch_minors_impl(void *arg)
 	job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
 	    FTAG, &os);
 	if (job->error == 0) {
-		dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+		dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ);
 		dmu_objset_disown(os, B_TRUE, FTAG);
 	}
 }