Split dmu_zfetch() speculation and execution parts
To make better predictions on parallel workloads, dmu_zfetch() should be called as early as possible to reduce possible request reordering. In particular, it should be called before dmu_buf_hold_array_by_dnode() calls dbuf_hold(), which may sleep waiting for indirect blocks, waking up multiple threads at the same time on completion; that can significantly reorder the requests, making the stream look random. But we should not issue prefetch requests before the on-demand ones, since they may get to the disks first despite the I/O scheduler, increasing on-demand request latency.

This patch splits dmu_zfetch() into two functions: dmu_zfetch_prepare() and dmu_zfetch_run(). The first can be executed as early as needed. It only updates statistics and makes predictions without issuing any I/Os. The I/O issuance is handled by dmu_zfetch_run(), which can be called later, when all on-demand I/Os are already issued. It even tracks the activity of other concurrent threads, issuing the prefetch only when _all_ on-demand requests have been issued.

For many years this was a big problem for storage servers handling deep request queues from their clients: they had to either serialize consecutive reads to make the ZFS prefetcher usable, or execute the incoming requests as-is and get almost no prefetch from ZFS, relying only on deep enough prefetch by the clients. The benefits of those approaches varied, but neither was perfect. With this patch, deep-queue sequential read benchmarks with CrystalDiskMark from Windows via iSCSI to a FreeBSD target show much better throughput, with an almost 100% prefetcher hit rate, compared to almost zero before.

While there, I also removed the per-stream zs_lock as useless, being completely covered by the parent zf_lock. Also, I reused the zs_blocks refcount to track the zf_stream linkage of the stream, since I believe the previous zs_fetch == NULL check in dmu_zfetch_stream_done() was racy.

Delete prefetch streams when they reach the ends of files. This saves up to 1KB of RAM per file, plus reduces searches through the stream list.

Block data prefetch (speculation and indirect block prefetch are still done, since they are cheaper) if all dbufs of the stream are already in the DMU cache. The first cache miss immediately fires all the prefetch that would have been done for the stream by that time. This saves some CPU time if the same files, fitting within the DMU cache capacity, are read over and over.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam Moss <c@yotes.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #11652
parent 296a4a369b
commit 891568c990
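For orientation before reading the diff: below is a minimal sketch of the new two-phase caller pattern, distilled from the dmu_buf_hold_array_by_dnode() changes in this commit. The variables dn, blkid, nblks, tag, zio and dbuf_flags are assumed to be in scope as in that function; error handling and the cacheability checks are elided.

	/* Phase 1: predict as early as possible; no prefetch I/O is issued. */
	zstream_t *zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
	    B_TRUE, B_TRUE);

	/* Issue the demand reads; dbuf_hold() may sleep on indirect blocks. */
	boolean_t missed = B_FALSE;
	for (uint64_t i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
		(void) dbuf_read(db, zio, dbuf_flags);
		if (db->db_state != DB_CACHED)
			missed = B_TRUE;	/* data prefetch is worthwhile */
	}

	/*
	 * Phase 2: issue the prefetch only after all demand I/Os.  The last
	 * of the concurrent callers tracked by zs_callers actually runs it.
	 */
	if (zs != NULL)
		dmu_zfetch_run(zs, missed, B_TRUE);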
diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h
@@ -49,20 +49,26 @@ typedef struct zfetch {
 typedef struct zstream {
 	uint64_t	zs_blkid;	/* expect next access at this blkid */
-	uint64_t	zs_pf_blkid;	/* next block to prefetch */
+	uint64_t	zs_pf_blkid1;	/* first block to prefetch */
+	uint64_t	zs_pf_blkid;	/* block to prefetch up to */
 
 	/*
 	 * We will next prefetch the L1 indirect block of this level-0
 	 * block id.
 	 */
-	uint64_t	zs_ipf_blkid;
+	uint64_t	zs_ipf_blkid1;	/* first block to prefetch */
+	uint64_t	zs_ipf_blkid;	/* block to prefetch up to */
 
-	kmutex_t	zs_lock;	/* protects stream */
-	hrtime_t	zs_atime;	/* time last prefetch issued */
-	hrtime_t	zs_start_time;	/* start of last prefetch */
 	list_node_t	zs_node;	/* link for zf_stream */
+	hrtime_t	zs_atime;	/* time last prefetch issued */
 	zfetch_t	*zs_fetch;	/* parent fetch */
-	zfs_refcount_t	zs_blocks;	/* number of pending blocks in the stream */
+	boolean_t	zs_missed;	/* stream saw cache misses */
+	zfs_refcount_t	zs_callers;	/* number of pending callers */
+	/*
+	 * Number of stream references: dnode, callers and pending blocks.
+	 * The stream memory is freed when the number returns to zero.
+	 */
+	zfs_refcount_t	zs_refs;
 } zstream_t;
 
 void zfetch_init(void);
@@ -70,7 +76,10 @@ void zfetch_fini(void);
 
 void dmu_zfetch_init(zfetch_t *, struct dnode *);
 void dmu_zfetch_fini(zfetch_t *);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t,
+zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
+    boolean_t);
+void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
     boolean_t);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
@@ -1640,7 +1640,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		mutex_exit(&db->db_mtx);
 		if (err == 0 && prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    flags & DB_RF_HAVESTRUCT);
+			    B_FALSE, flags & DB_RF_HAVESTRUCT);
 		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_hits);
@@ -1662,6 +1662,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		 */
 		if (!err && prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+			    db->db_state != DB_CACHED,
 			    flags & DB_RF_HAVESTRUCT);
 		}
 
@@ -1691,7 +1692,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		mutex_exit(&db->db_mtx);
 		if (prefetch) {
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-			    flags & DB_RF_HAVESTRUCT);
+			    B_TRUE, flags & DB_RF_HAVESTRUCT);
 		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_misses);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
@@ -497,10 +497,12 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dmu_buf_t **dbp;
+	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio = NULL;
+	boolean_t missed = B_FALSE;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
@@ -536,9 +538,21 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
+	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+		/*
+		 * Prepare the zfetch before initiating the demand reads, so
+		 * that if multiple threads block on same indirect block, we
+		 * base predictions on the original less racy request order.
+		 */
+		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
+		    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
+			if (zs)
+				dmu_zfetch_run(zs, missed, B_TRUE);
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
@@ -546,20 +560,27 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 			return (SET_ERROR(EIO));
 		}
 
-		/* initiate async i/o */
-		if (read)
+		/*
+		 * Initiate async demand data read.
+		 * We check the db_state after calling dbuf_read() because
+		 * (1) dbuf_read() may change the state to CACHED due to a
+		 * hit in the ARC, and (2) on a cache miss, a child will
+		 * have been added to "zio" but not yet completed, so the
+		 * state will not yet be CACHED.
+		 */
+		if (read) {
 			(void) dbuf_read(db, zio, dbuf_flags);
+			if (db->db_state != DB_CACHED)
+				missed = B_TRUE;
+		}
 		dbp[i] = &db->db;
 	}
 
 	if (!read)
 		zfs_racct_write(length, nblks);
 
-	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
-	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
-		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
-		    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
-	}
+	if (zs)
+		dmu_zfetch_run(zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (read) {
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
@@ -59,8 +59,6 @@ typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
 	kstat_named_t zfetchstat_misses;
 	kstat_named_t zfetchstat_max_streams;
-	kstat_named_t zfetchstat_max_completion_us;
-	kstat_named_t zfetchstat_last_completion_us;
 	kstat_named_t zfetchstat_io_issued;
 } zfetch_stats_t;
 
@@ -68,8 +66,6 @@ static zfetch_stats_t zfetch_stats = {
 	{ "hits",		KSTAT_DATA_UINT64 },
 	{ "misses",		KSTAT_DATA_UINT64 },
 	{ "max_streams",	KSTAT_DATA_UINT64 },
-	{ "max_completion_us",	KSTAT_DATA_UINT64 },
-	{ "last_completion_us",	KSTAT_DATA_UINT64 },
 	{ "io_issued",		KSTAT_DATA_UINT64 },
 };
 
@@ -129,7 +125,7 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 static void
 dmu_zfetch_stream_fini(zstream_t *zs)
 {
-	mutex_destroy(&zs->zs_lock);
+	ASSERT(!list_link_active(&zs->zs_node));
 	kmem_free(zs, sizeof (*zs));
 }
 
@@ -138,17 +134,10 @@ dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
 {
 	ASSERT(MUTEX_HELD(&zf->zf_lock));
 	list_remove(&zf->zf_stream, zs);
-	dmu_zfetch_stream_fini(zs);
 	zf->zf_numstreams--;
-}
-
-static void
-dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
-{
-	ASSERT(MUTEX_HELD(&zf->zf_lock));
-	list_remove(&zf->zf_stream, zs);
-	zs->zs_fetch = NULL;
-	zf->zf_numstreams--;
+	membar_producer();
+	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+		dmu_zfetch_stream_fini(zs);
 }
 
 /*
@@ -161,12 +150,8 @@ dmu_zfetch_fini(zfetch_t *zf)
 	zstream_t *zs;
 
 	mutex_enter(&zf->zf_lock);
-	while ((zs = list_head(&zf->zf_stream)) != NULL) {
-		if (zfs_refcount_count(&zs->zs_blocks) != 0)
-			dmu_zfetch_stream_orphan(zf, zs);
-		else
-			dmu_zfetch_stream_remove(zf, zs);
-	}
+	while ((zs = list_head(&zf->zf_stream)) != NULL)
+		dmu_zfetch_stream_remove(zf, zs);
 	mutex_exit(&zf->zf_lock);
 	list_destroy(&zf->zf_stream);
 	mutex_destroy(&zf->zf_lock);
@@ -195,9 +180,9 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	    zs != NULL; zs = zs_next) {
 		zs_next = list_next(&zf->zf_stream, zs);
 		/*
-		 * Skip gethrtime() call if there are still references
+		 * Skip if still active.  1 -- zf_stream reference.
 		 */
-		if (zfs_refcount_count(&zs->zs_blocks) != 0)
+		if (zfs_refcount_count(&zs->zs_refs) != 1)
 			continue;
 		if (((now - zs->zs_atime) / NANOSEC) >
 		    zfetch_min_sec_reap)
@@ -222,12 +207,17 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 
 	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
 	zs->zs_blkid = blkid;
+	zs->zs_pf_blkid1 = blkid;
 	zs->zs_pf_blkid = blkid;
+	zs->zs_ipf_blkid1 = blkid;
 	zs->zs_ipf_blkid = blkid;
 	zs->zs_atime = now;
 	zs->zs_fetch = zf;
-	zfs_refcount_create(&zs->zs_blocks);
-	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+	zs->zs_missed = B_FALSE;
+	zfs_refcount_create(&zs->zs_callers);
+	zfs_refcount_create(&zs->zs_refs);
+	/* One reference for zf_stream. */
+	zfs_refcount_add(&zs->zs_refs, NULL);
 	zf->zf_numstreams++;
 	list_insert_head(&zf->zf_stream, zs);
 }
@@ -237,48 +227,36 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
 {
 	zstream_t *zs = arg;
 
-	if (zs->zs_start_time && io_issued) {
-		hrtime_t now = gethrtime();
-		hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
-
-		zs->zs_start_time = 0;
-		ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
-		if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
-			ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
-	}
-
-	if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
-		return;
-
-	/*
-	 * The parent fetch structure has gone away
-	 */
-	if (zs->zs_fetch == NULL)
+	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
 		dmu_zfetch_stream_fini(zs);
 }
 
 /*
- * This is the predictive prefetch entry point.  It associates dnode access
- * specified with blkid and nblks arguments with prefetch stream, predicts
- * further accesses based on that stats and initiates speculative prefetch.
+ * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
+ * associates dnode access specified with blkid and nblks arguments with
+ * prefetch stream, predicts further accesses based on that stats and returns
+ * the stream pointer on success.  That pointer must later be passed to
+ * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
+ * release it.  dmu_zfetch() is a wrapper for simple cases when window between
+ * prediction and prefetch initiation is not needed.
  * fetch_data argument specifies whether actual data blocks should be fetched:
  *   FALSE -- prefetch only indirect blocks for predicted data blocks;
  *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
-void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
-    boolean_t have_lock)
+zstream_t *
+dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
 {
 	zstream_t *zs;
-	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+	int64_t pf_start, ipf_start;
 	int64_t pf_ahead_blks, max_blks;
-	int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
-	uint64_t end_of_access_blkid;
+	int max_dist_blks, pf_nblks, ipf_nblks;
+	uint64_t end_of_access_blkid, maxblkid;
 	end_of_access_blkid = blkid + nblks;
 	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
 
 	if (zfs_prefetch_disable)
-		return;
+		return (NULL);
 	/*
 	 * If we haven't yet loaded the indirect vdevs' mappings, we
 	 * can only read from blocks that we carefully ensure are on
@@ -287,14 +265,14 @@
 	 * blocks (e.g. of the MOS's dnode object).
 	 */
 	if (!spa_indirect_vdevs_loaded(spa))
-		return;
+		return (NULL);
 
 	/*
 	 * As a fast path for small (single-block) files, ignore access
 	 * to the first block.
 	 */
 	if (!have_lock && blkid == 0)
-		return;
+		return (NULL);
 
 	if (!have_lock)
 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
@@ -303,10 +281,11 @@
 	 * A fast path for small files for which no prefetch will
 	 * happen.
 	 */
-	if (zf->zf_dnode->dn_maxblkid < 2) {
+	maxblkid = zf->zf_dnode->dn_maxblkid;
+	if (maxblkid < 2) {
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return;
+		return (NULL);
 	}
 	mutex_enter(&zf->zf_lock);
 
@@ -317,31 +296,34 @@
 	 */
 	for (zs = list_head(&zf->zf_stream); zs != NULL;
 	    zs = list_next(&zf->zf_stream, zs)) {
-		if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
-			mutex_enter(&zs->zs_lock);
-			/*
-			 * zs_blkid could have changed before we
-			 * acquired zs_lock; re-check them here.
-			 */
-			if (blkid == zs->zs_blkid) {
-				break;
-			} else if (blkid + 1 == zs->zs_blkid) {
-				blkid++;
-				nblks--;
-				if (nblks == 0) {
-					/* Already prefetched this before. */
-					mutex_exit(&zs->zs_lock);
-					mutex_exit(&zf->zf_lock);
-					if (!have_lock) {
-						rw_exit(&zf->zf_dnode->
-						    dn_struct_rwlock);
-					}
-					return;
-				}
-				break;
-			}
-			mutex_exit(&zs->zs_lock);
+		if (blkid == zs->zs_blkid) {
+			break;
+		} else if (blkid + 1 == zs->zs_blkid) {
+			blkid++;
+			nblks--;
+			break;
 		}
 	}
 
+	/*
+	 * If the file is ending, remove the matching stream if found.
+	 * If not found then it is too late to create a new one now.
+	 */
+	if (end_of_access_blkid >= maxblkid) {
+		if (zs != NULL)
+			dmu_zfetch_stream_remove(zf, zs);
+		mutex_exit(&zf->zf_lock);
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+		return (NULL);
+	}
+
+	/* Exit if we already prefetched this block before. */
+	if (nblks == 0) {
+		mutex_exit(&zf->zf_lock);
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+		return (NULL);
+	}
+
 	if (zs == NULL) {
@@ -349,13 +331,12 @@
 		 * This access is not part of any existing stream.  Create
 		 * a new stream for it.
 		 */
-		ZFETCHSTAT_BUMP(zfetchstat_misses);
-
 		dmu_zfetch_stream_create(zf, end_of_access_blkid);
 		mutex_exit(&zf->zf_lock);
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return;
+		ZFETCHSTAT_BUMP(zfetchstat_misses);
+		return (NULL);
 	}
 
 	/*
@@ -369,6 +350,10 @@
 	 * start just after the block we just accessed.
 	 */
 	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+	if (zs->zs_pf_blkid1 < end_of_access_blkid)
+		zs->zs_pf_blkid1 = end_of_access_blkid;
+	if (zs->zs_ipf_blkid1 < end_of_access_blkid)
+		zs->zs_ipf_blkid1 = end_of_access_blkid;
 
 	/*
 	 * Double our amount of prefetched data, but don't let the
@@ -407,49 +392,108 @@
 	 * (i.e. the amount read now + the amount of data prefetched now).
 	 */
 	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
-	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+	max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid);
 	ipf_nblks = MIN(pf_ahead_blks, max_blks);
 	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
 
-	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
-	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
-	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
-
-	zs->zs_atime = gethrtime();
-	/* no prior reads in progress */
-	if (zfs_refcount_count(&zs->zs_blocks) == 0)
-		zs->zs_start_time = zs->zs_atime;
 	zs->zs_blkid = end_of_access_blkid;
-	zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
-	    NULL);
-	mutex_exit(&zs->zs_lock);
+	/* Protect the stream from reclamation. */
+	zs->zs_atime = gethrtime();
+	zfs_refcount_add(&zs->zs_refs, NULL);
+	/* Count concurrent callers. */
+	zfs_refcount_add(&zs->zs_callers, NULL);
 	mutex_exit(&zf->zf_lock);
-	issued = 0;
+
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+	return (zs);
+}
+
+void
+dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+{
+	zfetch_t *zf = zs->zs_fetch;
+	int64_t pf_start, pf_end, ipf_start, ipf_end;
+	int epbs, issued;
+
+	if (missed)
+		zs->zs_missed = missed;
 
 	/*
-	 * dbuf_prefetch() is asynchronous (even when it needs to read
-	 * indirect blocks), but we still prefer to drop our locks before
-	 * calling it to reduce the time we hold them.
+	 * Postpone the prefetch if there are more concurrent callers.
+	 * It happens when multiple requests are waiting for the same
+	 * indirect block.  The last one will run the prefetch for all.
 	 */
+	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
+		/* Drop reference taken in dmu_zfetch_prepare(). */
+		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+			dmu_zfetch_stream_fini(zs);
+		return;
+	}
 
-	for (int i = 0; i < pf_nblks; i++) {
-		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+	mutex_enter(&zf->zf_lock);
+	if (zs->zs_missed) {
+		pf_start = zs->zs_pf_blkid1;
+		pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
+	} else {
+		pf_start = pf_end = 0;
+	}
+	ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
+	ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
+	mutex_exit(&zf->zf_lock);
+	ASSERT3S(pf_start, <=, pf_end);
+	ASSERT3S(ipf_start, <=, ipf_end);
+
+	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
+	ASSERT3S(ipf_start, <=, ipf_end);
+	issued = pf_end - pf_start + ipf_end - ipf_start;
+	if (issued > 1) {
+		/* More references on top of taken in dmu_zfetch_prepare(). */
+		zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL);
+	} else if (issued == 0) {
+		/* Some other thread has done our work, so drop the ref. */
+		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+			dmu_zfetch_stream_fini(zs);
+		return;
+	}
+
+	if (!have_lock)
+		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+	issued = 0;
+	for (int64_t blk = pf_start; blk < pf_end; blk++) {
+		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
 		    dmu_zfetch_stream_done, zs);
 	}
-	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
 		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
 		    dmu_zfetch_stream_done, zs);
 	}
 
 	if (!have_lock)
 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-	ZFETCHSTAT_BUMP(zfetchstat_hits);
 
 	if (issued)
 		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
 }
 
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+    boolean_t missed, boolean_t have_lock)
+{
+	zstream_t *zs;
+
+	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
+	if (zs)
+		dmu_zfetch_run(zs, missed, have_lock);
+}
+
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
 	"Disable all ZFS prefetching");