diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h
index 34b711fc06..4c220b0c79 100644
--- a/include/sys/dmu_zfetch.h
+++ b/include/sys/dmu_zfetch.h
@@ -49,20 +49,26 @@ typedef struct zfetch {
 
 typedef struct zstream {
 	uint64_t	zs_blkid;	/* expect next access at this blkid */
-	uint64_t	zs_pf_blkid;	/* next block to prefetch */
+	uint64_t	zs_pf_blkid1;	/* first block to prefetch */
+	uint64_t	zs_pf_blkid;	/* block to prefetch up to */
 
 	/*
 	 * We will next prefetch the L1 indirect block of this level-0
 	 * block id.
 	 */
-	uint64_t	zs_ipf_blkid;
+	uint64_t	zs_ipf_blkid1;	/* first block to prefetch */
+	uint64_t	zs_ipf_blkid;	/* block to prefetch up to */
 
-	kmutex_t	zs_lock;	/* protects stream */
-	hrtime_t	zs_atime;	/* time last prefetch issued */
-	hrtime_t	zs_start_time;	/* start of last prefetch */
 	list_node_t	zs_node;	/* link for zf_stream */
+	hrtime_t	zs_atime;	/* time last prefetch issued */
 	zfetch_t	*zs_fetch;	/* parent fetch */
-	zfs_refcount_t	zs_blocks;	/* number of pending blocks in the stream */
+	boolean_t	zs_missed;	/* stream saw cache misses */
+	zfs_refcount_t	zs_callers;	/* number of pending callers */
+	/*
+	 * Number of stream references: dnode, callers and pending blocks.
+	 * The stream memory is freed when the number returns to zero.
+	 */
+	zfs_refcount_t	zs_refs;
 } zstream_t;
 
 void		zfetch_init(void);
@@ -70,7 +76,10 @@ void		zfetch_fini(void);
 
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
 void		dmu_zfetch_fini(zfetch_t *);
-void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t,
+zstream_t	*dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
+		    boolean_t);
+void		dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
 		    boolean_t);
 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index a6cdc017cd..d48dc7943a 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1640,7 +1640,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		mutex_exit(&db->db_mtx);
 		if (err == 0 && prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    flags & DB_RF_HAVESTRUCT);
+			    B_FALSE, flags & DB_RF_HAVESTRUCT);
 		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_hits);
@@ -1662,6 +1662,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		 */
 		if (!err && prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+			    db->db_state != DB_CACHED,
 			    flags & DB_RF_HAVESTRUCT);
 		}
 
@@ -1691,7 +1692,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		mutex_exit(&db->db_mtx);
 		if (prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    flags & DB_RF_HAVESTRUCT);
+			    B_TRUE, flags & DB_RF_HAVESTRUCT);
 		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_misses);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index b46bf60d1a..1c47430953 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -497,10 +497,12 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dmu_buf_t **dbp;
+	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio = NULL;
+	boolean_t missed = B_FALSE;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
@@ -536,9 +538,21 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
+	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+		/*
+		 * Prepare the zfetch before initiating the demand reads, so
+		 * that if multiple threads block on the same indirect block,
+		 * we base predictions on the original less racy request order.
+		 */
+		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
+		    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
+			if (zs)
+				dmu_zfetch_run(zs, missed, B_TRUE);
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
@@ -546,20 +560,27 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 			return (SET_ERROR(EIO));
 		}
 
-		/* initiate async i/o */
-		if (read)
+		/*
+		 * Initiate async demand data read.
+		 * We check the db_state after calling dbuf_read() because
+		 * (1) dbuf_read() may change the state to CACHED due to a
+		 * hit in the ARC, and (2) on a cache miss, a child will
+		 * have been added to "zio" but not yet completed, so the
+		 * state will not yet be CACHED.
+		 */
+		if (read) {
 			(void) dbuf_read(db, zio, dbuf_flags);
+			if (db->db_state != DB_CACHED)
+				missed = B_TRUE;
+		}
 		dbp[i] = &db->db;
 	}
 
 	if (!read)
 		zfs_racct_write(length, nblks);
 
-	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
-	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
-		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
-		    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
-	}
+	if (zs)
+		dmu_zfetch_run(zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (read) {
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 5d061fe381..3d7407016d 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -59,8 +59,6 @@ typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
 	kstat_named_t zfetchstat_misses;
 	kstat_named_t zfetchstat_max_streams;
-	kstat_named_t zfetchstat_max_completion_us;
-	kstat_named_t zfetchstat_last_completion_us;
 	kstat_named_t zfetchstat_io_issued;
 } zfetch_stats_t;
 
@@ -68,8 +66,6 @@ static zfetch_stats_t zfetch_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "max_streams",		KSTAT_DATA_UINT64 },
-	{ "max_completion_us",		KSTAT_DATA_UINT64 },
-	{ "last_completion_us",		KSTAT_DATA_UINT64 },
 	{ "io_issued",		KSTAT_DATA_UINT64 },
 };
 
@@ -129,7 +125,7 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 static void
 dmu_zfetch_stream_fini(zstream_t *zs)
 {
-	mutex_destroy(&zs->zs_lock);
+	ASSERT(!list_link_active(&zs->zs_node));
 	kmem_free(zs, sizeof (*zs));
 }
 
@@ -138,17 +134,10 @@ dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
 {
 	ASSERT(MUTEX_HELD(&zf->zf_lock));
 	list_remove(&zf->zf_stream, zs);
-	dmu_zfetch_stream_fini(zs);
-	zf->zf_numstreams--;
-}
-
-static void
-dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
-{
-	ASSERT(MUTEX_HELD(&zf->zf_lock));
-	list_remove(&zf->zf_stream, zs);
-	zs->zs_fetch = NULL;
 	zf->zf_numstreams--;
+	membar_producer();
+	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+		dmu_zfetch_stream_fini(zs);
 }
 
 /*
@@ -161,12 +150,8 @@ dmu_zfetch_fini(zfetch_t *zf)
 	zstream_t *zs;
 
 	mutex_enter(&zf->zf_lock);
-	while ((zs = list_head(&zf->zf_stream)) != NULL) {
-		if (zfs_refcount_count(&zs->zs_blocks) != 0)
-			dmu_zfetch_stream_orphan(zf, zs);
-		else
-			dmu_zfetch_stream_remove(zf, zs);
-	}
+	while ((zs = list_head(&zf->zf_stream)) != NULL)
+		dmu_zfetch_stream_remove(zf, zs);
 	mutex_exit(&zf->zf_lock);
 	list_destroy(&zf->zf_stream);
 	mutex_destroy(&zf->zf_lock);
@@ -195,9 +180,9 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	    zs != NULL; zs = zs_next) {
 		zs_next = list_next(&zf->zf_stream, zs);
 		/*
-		 * Skip gethrtime() call if there are still references
+		 * Skip if still active.  1 -- zf_stream reference.
 		 */
-		if (zfs_refcount_count(&zs->zs_blocks) != 0)
+		if (zfs_refcount_count(&zs->zs_refs) != 1)
 			continue;
 		if (((now - zs->zs_atime) / NANOSEC) >
 		    zfetch_min_sec_reap)
@@ -222,12 +207,17 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 
 	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
 	zs->zs_blkid = blkid;
+	zs->zs_pf_blkid1 = blkid;
 	zs->zs_pf_blkid = blkid;
+	zs->zs_ipf_blkid1 = blkid;
 	zs->zs_ipf_blkid = blkid;
 	zs->zs_atime = now;
 	zs->zs_fetch = zf;
-	zfs_refcount_create(&zs->zs_blocks);
-	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+	zs->zs_missed = B_FALSE;
+	zfs_refcount_create(&zs->zs_callers);
+	zfs_refcount_create(&zs->zs_refs);
+	/* One reference for zf_stream. */
+	zfs_refcount_add(&zs->zs_refs, NULL);
 	zf->zf_numstreams++;
 	list_insert_head(&zf->zf_stream, zs);
 }
@@ -237,48 +227,36 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
 {
 	zstream_t *zs = arg;
 
-	if (zs->zs_start_time && io_issued) {
-		hrtime_t now = gethrtime();
-		hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
-
-		zs->zs_start_time = 0;
-		ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
-		if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
-			ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
-	}
-
-	if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
-		return;
-
-	/*
-	 * The parent fetch structure has gone away
-	 */
-	if (zs->zs_fetch == NULL)
+	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
 		dmu_zfetch_stream_fini(zs);
 }
 
 /*
- * This is the predictive prefetch entry point.  It associates dnode access
- * specified with blkid and nblks arguments with prefetch stream, predicts
- * further accesses based on that stats and initiates speculative prefetch.
+ * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
+ * associates the dnode access specified by the blkid and nblks arguments
+ * with a prefetch stream, predicts further accesses based on those stats,
+ * and returns the stream pointer on success.  That pointer must later be
+ * passed to dmu_zfetch_run() to initiate the speculative prefetch for the
+ * stream and release it.  dmu_zfetch() is a wrapper for simple cases when
+ * a window between prediction and prefetch initiation is not needed.
  * fetch_data argument specifies whether actual data blocks should be fetched:
  *   FALSE -- prefetch only indirect blocks for predicted data blocks;
  *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
-void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
-    boolean_t have_lock)
+zstream_t *
+dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
+    boolean_t fetch_data, boolean_t have_lock)
 {
 	zstream_t *zs;
-	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+	int64_t pf_start, ipf_start;
 	int64_t pf_ahead_blks, max_blks;
-	int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
-	uint64_t end_of_access_blkid;
+	int max_dist_blks, pf_nblks, ipf_nblks;
+	uint64_t end_of_access_blkid, maxblkid;
 	end_of_access_blkid = blkid + nblks;
 	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
 
 	if (zfs_prefetch_disable)
-		return;
+		return (NULL);
 	/*
 	 * If we haven't yet loaded the indirect vdevs' mappings, we
 	 * can only read from blocks that we carefully ensure are on
@@ -287,14 +265,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * blocks (e.g. of the MOS's dnode object).
 	 */
 	if (!spa_indirect_vdevs_loaded(spa))
-		return;
+		return (NULL);
 
 	/*
 	 * As a fast path for small (single-block) files, ignore access
 	 * to the first block.
 	 */
 	if (!have_lock && blkid == 0)
-		return;
+		return (NULL);
 
 	if (!have_lock)
 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
@@ -303,10 +281,11 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * A fast path for small files for which no prefetch will
 	 * happen.
 	 */
-	if (zf->zf_dnode->dn_maxblkid < 2) {
+	maxblkid = zf->zf_dnode->dn_maxblkid;
+	if (maxblkid < 2) {
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return;
+		return (NULL);
 	}
 
 	mutex_enter(&zf->zf_lock);
@@ -317,45 +296,47 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 */
 	for (zs = list_head(&zf->zf_stream); zs != NULL;
 	    zs = list_next(&zf->zf_stream, zs)) {
-		if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
-			mutex_enter(&zs->zs_lock);
-			/*
-			 * zs_blkid could have changed before we
-			 * acquired zs_lock; re-check them here.
-			 */
-			if (blkid == zs->zs_blkid) {
-				break;
-			} else if (blkid + 1 == zs->zs_blkid) {
-				blkid++;
-				nblks--;
-				if (nblks == 0) {
-					/* Already prefetched this before. */
-					mutex_exit(&zs->zs_lock);
-					mutex_exit(&zf->zf_lock);
-					if (!have_lock) {
-						rw_exit(&zf->zf_dnode->
-						    dn_struct_rwlock);
-					}
-					return;
-				}
-				break;
-			}
-			mutex_exit(&zs->zs_lock);
+		if (blkid == zs->zs_blkid) {
+			break;
+		} else if (blkid + 1 == zs->zs_blkid) {
+			blkid++;
+			nblks--;
+			break;
 		}
 	}
 
+	/*
+	 * If the file is ending, remove the matching stream if found.
+	 * If not found then it is too late to create a new one now.
+	 */
+	if (end_of_access_blkid >= maxblkid) {
+		if (zs != NULL)
+			dmu_zfetch_stream_remove(zf, zs);
+		mutex_exit(&zf->zf_lock);
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+		return (NULL);
+	}
+
+	/* Exit if we already prefetched this block before. */
+	if (nblks == 0) {
+		mutex_exit(&zf->zf_lock);
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+		return (NULL);
+	}
+
 	if (zs == NULL) {
 		/*
 		 * This access is not part of any existing stream.  Create
 		 * a new stream for it.
 		 */
-		ZFETCHSTAT_BUMP(zfetchstat_misses);
-
 		dmu_zfetch_stream_create(zf, end_of_access_blkid);
 		mutex_exit(&zf->zf_lock);
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return;
+		ZFETCHSTAT_BUMP(zfetchstat_misses);
+		return (NULL);
 	}
 
 	/*
@@ -369,6 +350,10 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * start just after the block we just accessed.
	 */
 	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+	if (zs->zs_pf_blkid1 < end_of_access_blkid)
+		zs->zs_pf_blkid1 = end_of_access_blkid;
+	if (zs->zs_ipf_blkid1 < end_of_access_blkid)
+		zs->zs_ipf_blkid1 = end_of_access_blkid;
 
 	/*
 	 * Double our amount of prefetched data, but don't let the
@@ -407,49 +392,108 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * (i.e. the amount read now + the amount of data prefetched now).
 	 */
 	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
-	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+	max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid);
 	ipf_nblks = MIN(pf_ahead_blks, max_blks);
 	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
 
-	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
-	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
-	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
-
-	zs->zs_atime = gethrtime();
-	/* no prior reads in progress */
-	if (zfs_refcount_count(&zs->zs_blocks) == 0)
-		zs->zs_start_time = zs->zs_atime;
 	zs->zs_blkid = end_of_access_blkid;
-	zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
-	    NULL);
-	mutex_exit(&zs->zs_lock);
+	/* Protect the stream from reclamation. */
+	zs->zs_atime = gethrtime();
+	zfs_refcount_add(&zs->zs_refs, NULL);
+	/* Count concurrent callers. */
+	zfs_refcount_add(&zs->zs_callers, NULL);
 	mutex_exit(&zf->zf_lock);
-	issued = 0;
+
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+	return (zs);
+}
+
+void
+dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+{
+	zfetch_t *zf = zs->zs_fetch;
+	int64_t pf_start, pf_end, ipf_start, ipf_end;
+	int epbs, issued;
+
+	if (missed)
+		zs->zs_missed = missed;
 
 	/*
-	 * dbuf_prefetch() is asynchronous (even when it needs to read
-	 * indirect blocks), but we still prefer to drop our locks before
-	 * calling it to reduce the time we hold them.
+	 * Postpone the prefetch if there are more concurrent callers.
+	 * It happens when multiple requests are waiting for the same
+	 * indirect block.  The last one will run the prefetch for all.
 	 */
+	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
+		/* Drop reference taken in dmu_zfetch_prepare(). */
+		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+			dmu_zfetch_stream_fini(zs);
+		return;
+	}
 
-	for (int i = 0; i < pf_nblks; i++) {
-		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+	mutex_enter(&zf->zf_lock);
+	if (zs->zs_missed) {
+		pf_start = zs->zs_pf_blkid1;
+		pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
+	} else {
+		pf_start = pf_end = 0;
+	}
+	ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
+	ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
+	mutex_exit(&zf->zf_lock);
+	ASSERT3S(pf_start, <=, pf_end);
+	ASSERT3S(ipf_start, <=, ipf_end);
+
+	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
+	ASSERT3S(ipf_start, <=, ipf_end);
+	issued = pf_end - pf_start + ipf_end - ipf_start;
+	if (issued > 1) {
+		/* More references on top of taken in dmu_zfetch_prepare(). */
+		zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL);
+	} else if (issued == 0) {
+		/* Some other thread has done our work, so drop the ref. */
+		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+			dmu_zfetch_stream_fini(zs);
+		return;
+	}
+
+	if (!have_lock)
+		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+	issued = 0;
+	for (int64_t blk = pf_start; blk < pf_end; blk++) {
+		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
 		    dmu_zfetch_stream_done, zs);
 	}
-	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
 		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
 		    dmu_zfetch_stream_done, zs);
 	}
+
 	if (!have_lock)
 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
 
-	ZFETCHSTAT_BUMP(zfetchstat_hits);
 	if (issued)
 		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
 }
 
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+    boolean_t missed, boolean_t have_lock)
+{
+	zstream_t *zs;
+
+	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
+	if (zs)
+		dmu_zfetch_run(zs, missed, have_lock);
+}
+
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
 	"Disable all ZFS prefetching");
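
For reference, a minimal caller-side sketch of the split prepare/run API introduced
above, modeled on the dmu_buf_hold_array_by_dnode() change.  It assumes the caller
already holds dn->dn_struct_rwlock (have_lock == B_TRUE); read_range_with_prefetch()
and issue_demand_read() are hypothetical placeholders for the dbuf_hold()/dbuf_read()
loop and are not part of this patch.

	static void
	read_range_with_prefetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
	{
		zstream_t *zs;
		boolean_t missed = B_FALSE;

		/*
		 * Predict before issuing the demand reads, while the request
		 * order is still the least racy.  May return NULL (prefetch
		 * disabled, tiny file, file ending, duplicate access, ...).
		 */
		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
		    B_TRUE /* fetch_data */, B_TRUE /* have_lock */);

		for (uint64_t i = 0; i < nblks; i++) {
			/* Hypothetical stand-in for dbuf_hold() + dbuf_read(). */
			if (issue_demand_read(dn, blkid + i) != 0)
				missed = B_TRUE;	/* block was not cached */
		}

		/*
		 * Issue the speculative prefetch (only the last of several
		 * concurrent callers actually runs it) and drop the stream
		 * reference taken by dmu_zfetch_prepare().
		 */
		if (zs != NULL)
			dmu_zfetch_run(zs, missed, B_TRUE /* have_lock */);
	}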