Illumos 5987 - zfs prefetch code needs work

5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>

References:
  https://www.illumos.org/issues/5987 zfs prefetch code needs work
  illumos/illumos-gate@cf6106c 5987 zfs prefetch code needs work

Porting notes:
- [module/zfs/dbuf.c]
  - 5f6d0b6 Handle block pointers with a corrupt logical size
- [module/zfs/dmu_zfetch.c]
  - c65aa5b Fix gcc missing parenthesis warnings
  - 428870f Update core ZFS code from build 121 to build 141.
  - 79c76d5 Change KM_PUSHPAGE -> KM_SLEEP
  - b8d06fc Switch KM_SLEEP to KM_PUSHPAGE
  - Account for ISO C90 "mixed declarations and code" warnings
  - Module parameters (new/changed):
    - Replaced zfetch_block_cap with zfetch_max_distance
      (Max bytes to prefetch per stream (default 8MB; 8 * 1024 * 1024))
    - Preserved zfs_prefetch_disable as 'int' for consistency with
      existing Linux module options.
- [include/sys/trace_arc.h]
  - Added new tracepoints
    - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
    - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
- [man/man5/zfs-module-parameters.5]
  - Updated man page

Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Matthew Ahrens 2015-12-26 22:10:31 +01:00 committed by Brian Behlendorf
parent ab5cbbd107
commit 7f60329a26
10 changed files with 285 additions and 663 deletions

View File

@ -84,27 +84,31 @@ typedef enum arc_flags
ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */
/* /*
* Private ARC flags. These flags are private ARC only flags that * Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_hdr_buf_t. These flags should * will show up in b_flags in the arc_hdr_buf_t. These flags should
* only be set by ARC code. * only be set by ARC code.
*/ */
ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */
ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ /* Indicates that block was read with ASYNC priority. */
ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ ARC_FLAG_PRIO_ASYNC_READ = 1 << 14,
ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */ /* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 16, ARC_FLAG_BUFC_METADATA = 1 << 18,
/* Flags specifying whether optional hdr struct fields are defined */ /* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 17, ARC_FLAG_HAS_L1HDR = 1 << 19,
ARC_FLAG_HAS_L2HDR = 1 << 18, ARC_FLAG_HAS_L2HDR = 1 << 20,
} arc_flags_t; } arc_flags_t;
struct arc_buf { struct arc_buf {

View File

@ -487,7 +487,8 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
* individually with dmu_buf_rele. * individually with dmu_buf_rele.
*/ */
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); uint64_t length, boolean_t read, void *tag,
int *numbufsp, dmu_buf_t ***dbpp);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
typedef void dmu_buf_evict_func_t(void *user_ptr); typedef void dmu_buf_evict_func_t(void *user_ptr);

View File

@ -23,8 +23,12 @@
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _DFETCH_H /*
#define _DFETCH_H * Copyright (c) 2014 by Delphix. All rights reserved.
*/
#ifndef _DMU_ZFETCH_H
#define _DMU_ZFETCH_H
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -36,41 +40,30 @@ extern unsigned long zfetch_array_rd_sz;
struct dnode; /* so we can reference dnode */ struct dnode; /* so we can reference dnode */
typedef enum zfetch_dirn {
ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
} zfetch_dirn_t;
typedef struct zstream { typedef struct zstream {
uint64_t zst_offset; /* offset of starting block in range */ uint64_t zs_blkid; /* expect next access at this blkid */
uint64_t zst_len; /* length of range, in blocks */ uint64_t zs_pf_blkid; /* next block to prefetch */
zfetch_dirn_t zst_direction; /* direction of prefetch */ kmutex_t zs_lock; /* protects stream */
uint64_t zst_stride; /* length of stride, in blocks */ hrtime_t zs_atime; /* time last prefetch issued */
uint64_t zst_ph_offset; /* prefetch offset, in blocks */ list_node_t zs_node; /* link for zf_stream */
uint64_t zst_cap; /* prefetch limit (cap), in blocks */
kmutex_t zst_lock; /* protects stream */
clock_t zst_last; /* lbolt of last prefetch */
list_node_t zst_node; /* next zstream here */
} zstream_t; } zstream_t;
typedef struct zfetch { typedef struct zfetch {
krwlock_t zf_rwlock; /* protects zfetch structure */ krwlock_t zf_rwlock; /* protects zfetch structure */
list_t zf_stream; /* AVL tree of zstream_t's */ list_t zf_stream; /* list of zstream_t's */
struct dnode *zf_dnode; /* dnode that owns this zfetch */ struct dnode *zf_dnode; /* dnode that owns this zfetch */
uint32_t zf_stream_cnt; /* # of active streams */
uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
} zfetch_t; } zfetch_t;
void zfetch_init(void); void zfetch_init(void);
void zfetch_fini(void); void zfetch_fini(void);
void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_rele(zfetch_t *); void dmu_zfetch_fini(zfetch_t *);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* _DFETCH_H */ #endif /* _DMU_ZFETCH_H */

View File

@ -102,6 +102,8 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);

View File

@ -331,12 +331,12 @@ Default value: \fB1,048,576\fR.
.sp .sp
.ne 2 .ne 2
.na .na
\fBzfetch_block_cap\fR (uint) \fBzfetch_max_distance\fR (uint)
.ad .ad
.RS 12n .RS 12n
Max number of blocks to prefetch at a time Max bytes to prefetch per stream (default 8MB).
.sp .sp
Default value: \fB256\fR. Default value: \fB8,388,608\fR.
.RE .RE
.sp .sp
@ -1246,7 +1246,10 @@ Default value: \fB52,428,800\fR.
\fBzfs_prefetch_disable\fR (int) \fBzfs_prefetch_disable\fR (int)
.ad .ad
.RS 12n .RS 12n
Disable all ZFS prefetching This tunable disables predictive prefetch. Note that it leaves "prescient"
prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
prescient prefetch never issues i/os that end up not being needed, so it
can't hurt performance.
.sp .sp
Use \fB1\fR for yes and \fB0\fR for no (default). Use \fB1\fR for yes and \fB0\fR for no (default).
.RE .RE

View File

@ -474,6 +474,8 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min; kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_sync_wait_for_async;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
kstat_named_t arcstat_need_free; kstat_named_t arcstat_need_free;
kstat_named_t arcstat_sys_free; kstat_named_t arcstat_sys_free;
} arc_stats_t; } arc_stats_t;
@ -568,6 +570,8 @@ static arc_stats_t arc_stats = {
{ "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 },
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 },
{ "arc_sys_free", KSTAT_DATA_UINT64 } { "arc_sys_free", KSTAT_DATA_UINT64 }
}; };
@ -4244,6 +4248,36 @@ top:
if (HDR_IO_IN_PROGRESS(hdr)) { if (HDR_IO_IN_PROGRESS(hdr)) {
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
* This sync read must wait for an
* in-progress async read (e.g. a predictive
* prefetch). Async reads are queued
* separately at the vdev_queue layer, so
* this is a form of priority inversion.
* Ideally, we would "inherit" the demand
* i/o's priority by moving the i/o from
* the async queue to the synchronous queue,
* but there is currently no mechanism to do
* so. Track this so that we can evaluate
* the magnitude of this potential performance
* problem.
*
* Note that if the prefetch i/o is already
* active (has been issued to the device),
* the prefetch improved performance, because
* we issued it sooner than we would have
* without the prefetch.
*/
DTRACE_PROBE1(arc__sync__wait__for__async,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
}
if (*arc_flags & ARC_FLAG_WAIT) { if (*arc_flags & ARC_FLAG_WAIT) {
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock); mutex_exit(hash_lock);
@ -4252,7 +4286,7 @@ top:
ASSERT(*arc_flags & ARC_FLAG_NOWAIT); ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
if (done) { if (done) {
arc_callback_t *acb = NULL; arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t), acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP); KM_SLEEP);
@ -4277,6 +4311,19 @@ top:
hdr->b_l1hdr.b_state == arc_mfu); hdr->b_l1hdr.b_state == arc_mfu);
if (done) { if (done) {
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
/*
* This is a demand read which does not have to
* wait for i/o because we did a predictive
* prefetch i/o for it, which has completed.
*/
DTRACE_PROBE1(
arc__demand__hit__predictive__prefetch,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(
arcstat_demand_hit_predictive_prefetch);
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
}
add_reference(hdr, hash_lock, private); add_reference(hdr, hash_lock, private);
/* /*
* If this block is already in use, create a new * If this block is already in use, create a new
@ -4349,12 +4396,16 @@ top:
goto top; /* restart the IO request */ goto top; /* restart the IO request */
} }
/* if this is a prefetch, we don't have a reference */ /*
if (*arc_flags & ARC_FLAG_PREFETCH) { * If there is a callback, we pass our reference to
* it; otherwise we remove our reference.
*/
if (done == NULL) {
(void) remove_reference(hdr, hash_lock, (void) remove_reference(hdr, hash_lock,
private); private);
hdr->b_flags |= ARC_FLAG_PREFETCH;
} }
if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH;
if (*arc_flags & ARC_FLAG_L2CACHE) if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE; hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS) if (*arc_flags & ARC_FLAG_L2COMPRESS)
@ -4377,11 +4428,13 @@ top:
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
/* if this is a prefetch, we don't have a reference */ /*
* If there is a callback, we pass a reference to it.
*/
if (done != NULL)
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_PREFETCH) if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH; hdr->b_flags |= ARC_FLAG_PREFETCH;
else
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_L2CACHE) if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE; hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS) if (*arc_flags & ARC_FLAG_L2COMPRESS)
@ -4399,6 +4452,8 @@ top:
arc_access(hdr, hash_lock); arc_access(hdr, hash_lock);
} }
if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@ -4438,6 +4493,11 @@ top:
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses); data, metadata, misses);
if (priority == ZIO_PRIORITY_ASYNC_READ)
hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
else
hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/* /*
* Read from the L2ARC if the following are true: * Read from the L2ARC if the following are true:

View File

@ -676,7 +676,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
} }
static int static int
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{ {
dnode_t *dn; dnode_t *dn;
zbookmark_phys_t zb; zbookmark_phys_t zb;
@ -723,7 +723,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
db->db.db_size, db, type)); db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size); bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED; db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
return (0); return (0);
} }
@ -746,10 +745,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb); &aflags, &zb);
if (aflags & ARC_FLAG_CACHED)
*flags |= DB_RF_CACHED;
return (SET_ERROR(err)); return (SET_ERROR(err));
} }
@ -784,8 +781,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (db->db_state == DB_CACHED) { if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
if (prefetch) if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0) if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
@ -795,13 +791,12 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (zio == NULL) if (zio == NULL)
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
err = dbuf_read_impl(db, zio, &flags); err = dbuf_read_impl(db, zio, flags);
/* dbuf_read_impl has dropped db_mtx for us */ /* dbuf_read_impl has dropped db_mtx for us */
if (!err && prefetch) if (!err && prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
db->db.db_size, flags & DB_RF_CACHED);
if ((flags & DB_RF_HAVESTRUCT) == 0) if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
@ -820,8 +815,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/ */
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
if (prefetch) if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0) if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
@ -2143,6 +2137,9 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (blkid > dn->dn_maxblkid)
return;
if (dnode_block_freed(dn, blkid)) if (dnode_block_freed(dn, blkid))
return; return;

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
@ -386,7 +386,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
*/ */
static int static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{ {
dmu_buf_t **dbp; dmu_buf_t **dbp;
uint64_t blkid, nblks, i; uint64_t blkid, nblks, i;
@ -396,15 +396,19 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
ASSERT(length <= DMU_MAX_ACCESS); ASSERT(length <= DMU_MAX_ACCESS);
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; /*
if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) * Note: We directly notify the prefetch code of this read, so that
dbuf_flags |= DB_RF_NOPREFETCH; * we can tell it about the multi-block read. dbuf_read() only knows
* about the one block it is accessing.
*/
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
DB_RF_NOPREFETCH;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) { if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift; int blkshift = dn->dn_datablkshift;
nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
} else { } else {
if (offset + length > dn->dn_datablksz) { if (offset + length > dn->dn_datablksz) {
zfs_panic_recover("zfs: accessing past end of object " zfs_panic_recover("zfs: accessing past end of object "
@ -423,19 +427,24 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset); blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) { for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) { if (db == NULL) {
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag); dmu_buf_rele_array(dbp, nblks, tag);
zio_nowait(zio); zio_nowait(zio);
return (SET_ERROR(EIO)); return (SET_ERROR(EIO));
} }
/* initiate async i/o */ /* initiate async i/o */
if (read) { if (read)
(void) dbuf_read(db, zio, dbuf_flags); (void) dbuf_read(db, zio, dbuf_flags);
}
dbp[i] = &db->db; dbp[i] = &db->db;
} }
if ((flags & DMU_READ_NO_PREFETCH) == 0 && read &&
length < zfetch_array_rd_sz) {
dmu_zfetch(&dn->dn_zfetch, blkid, nblks);
}
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
/* wait for async i/o */ /* wait for async i/o */
@ -489,7 +498,8 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
int int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) uint64_t length, boolean_t read, void *tag, int *numbufsp,
dmu_buf_t ***dbpp)
{ {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn; dnode_t *dn;
@ -537,9 +547,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t blkid; uint64_t blkid;
int nblks, err; int nblks, err;
if (zfs_prefetch_disable)
return;
if (len == 0) { /* they're interested in the bonus buffer */ if (len == 0) { /* they're interested in the bonus buffer */
dn = DMU_META_DNODE(os); dn = DMU_META_DNODE(os);

View File

@ -24,7 +24,7 @@
*/ */
/* /*
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -36,209 +36,43 @@
#include <sys/kstat.h> #include <sys/kstat.h>
/* /*
* I'm against tune-ables, but these should probably exist as tweakable globals * This tunable disables predictive prefetch. Note that it leaves "prescient"
* until we can get this working the way we want it to. * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
* prescient prefetch never issues i/os that end up not being needed,
* so it can't hurt performance.
*/ */
int zfs_prefetch_disable = 0; int zfs_prefetch_disable = B_FALSE;
/* max # of streams per zfetch */ /* max # of streams per zfetch */
unsigned int zfetch_max_streams = 8; unsigned int zfetch_max_streams = 8;
/* min time before stream reclaim */ /* min time before stream reclaim */
unsigned int zfetch_min_sec_reap = 2; unsigned int zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */ /* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_block_cap = 256; unsigned int zfetch_max_distance = 8 * 1024 * 1024;
/* number of bytes in a array_read at which we stop prefetching (1Mb) */ /* number of bytes in a array_read at which we stop prefetching (1MB) */
unsigned long zfetch_array_rd_sz = 1024 * 1024; unsigned long zfetch_array_rd_sz = 1024 * 1024;
/* forward decls for static routines */
static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
typedef struct zfetch_stats { typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_hits;
kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_colinear_hits; kstat_named_t zfetchstat_max_streams;
kstat_named_t zfetchstat_colinear_misses;
kstat_named_t zfetchstat_stride_hits;
kstat_named_t zfetchstat_stride_misses;
kstat_named_t zfetchstat_reclaim_successes;
kstat_named_t zfetchstat_reclaim_failures;
kstat_named_t zfetchstat_stream_resets;
kstat_named_t zfetchstat_stream_noresets;
kstat_named_t zfetchstat_bogus_streams;
} zfetch_stats_t; } zfetch_stats_t;
static zfetch_stats_t zfetch_stats = { static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 }, { "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 },
{ "colinear_hits", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 },
{ "colinear_misses", KSTAT_DATA_UINT64 },
{ "stride_hits", KSTAT_DATA_UINT64 },
{ "stride_misses", KSTAT_DATA_UINT64 },
{ "reclaim_successes", KSTAT_DATA_UINT64 },
{ "reclaim_failures", KSTAT_DATA_UINT64 },
{ "streams_resets", KSTAT_DATA_UINT64 },
{ "streams_noresets", KSTAT_DATA_UINT64 },
{ "bogus_streams", KSTAT_DATA_UINT64 },
}; };
#define ZFETCHSTAT_INCR(stat, val) \ #define ZFETCHSTAT_BUMP(stat) \
atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); atomic_inc_64(&zfetch_stats.stat.value.ui64);
#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1);
kstat_t *zfetch_ksp; kstat_t *zfetch_ksp;
/*
* Given a zfetch structure and a zstream structure, determine whether the
* blocks to be read are part of a co-linear pair of existing prefetch
* streams. If a set is found, coalesce the streams, removing one, and
* configure the prefetch so it looks for a strided access pattern.
*
* In other words: if we find two sequential access streams that are
* the same length and distance N apart, and this read is N from the
* last stream, then we are probably in a strided access pattern. So
* combine the two sequential streams into a single strided stream.
*
* Returns whether co-linear streams were found.
*/
static boolean_t
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
zstream_t *z_walk;
zstream_t *z_comp;
if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
return (0);
if (zh == NULL) {
rw_exit(&zf->zf_rwlock);
return (0);
}
for (z_walk = list_head(&zf->zf_stream); z_walk;
z_walk = list_next(&zf->zf_stream, z_walk)) {
for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
z_comp = list_next(&zf->zf_stream, z_comp)) {
int64_t diff;
if (z_walk->zst_len != z_walk->zst_stride ||
z_comp->zst_len != z_comp->zst_stride) {
continue;
}
diff = z_comp->zst_offset - z_walk->zst_offset;
if (z_comp->zst_offset + diff == zh->zst_offset) {
z_walk->zst_offset = zh->zst_offset;
z_walk->zst_direction = diff < 0 ?
ZFETCH_BACKWARD : ZFETCH_FORWARD;
z_walk->zst_stride =
diff * z_walk->zst_direction;
z_walk->zst_ph_offset =
zh->zst_offset + z_walk->zst_stride;
dmu_zfetch_stream_remove(zf, z_comp);
mutex_destroy(&z_comp->zst_lock);
kmem_free(z_comp, sizeof (zstream_t));
dmu_zfetch_dofetch(zf, z_walk);
rw_exit(&zf->zf_rwlock);
return (1);
}
diff = z_walk->zst_offset - z_comp->zst_offset;
if (z_walk->zst_offset + diff == zh->zst_offset) {
z_walk->zst_offset = zh->zst_offset;
z_walk->zst_direction = diff < 0 ?
ZFETCH_BACKWARD : ZFETCH_FORWARD;
z_walk->zst_stride =
diff * z_walk->zst_direction;
z_walk->zst_ph_offset =
zh->zst_offset + z_walk->zst_stride;
dmu_zfetch_stream_remove(zf, z_comp);
mutex_destroy(&z_comp->zst_lock);
kmem_free(z_comp, sizeof (zstream_t));
dmu_zfetch_dofetch(zf, z_walk);
rw_exit(&zf->zf_rwlock);
return (1);
}
}
}
rw_exit(&zf->zf_rwlock);
return (0);
}
/*
* Given a zstream_t, determine the bounds of the prefetch. Then call the
* routine that actually prefetches the individual blocks.
*/
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
uint64_t prefetch_tail;
uint64_t prefetch_limit;
uint64_t prefetch_ofst;
uint64_t prefetch_len;
uint64_t blocks_fetched;
zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
(int64_t)(zs->zst_offset + zs->zst_stride));
/*
* XXX: use a faster division method?
*/
prefetch_limit = zs->zst_offset + zs->zst_len +
(zs->zst_cap * zs->zst_stride) / zs->zst_len;
while (prefetch_tail < prefetch_limit) {
prefetch_ofst = zs->zst_offset + zs->zst_direction *
(prefetch_tail - zs->zst_offset);
prefetch_len = zs->zst_len;
/*
* Don't prefetch beyond the end of the file, if working
* backwards.
*/
if ((zs->zst_direction == ZFETCH_BACKWARD) &&
(prefetch_ofst > prefetch_tail)) {
prefetch_len += prefetch_ofst;
prefetch_ofst = 0;
}
/* don't prefetch more than we're supposed to */
if (prefetch_len > zs->zst_len)
break;
blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
prefetch_ofst, zs->zst_len);
prefetch_tail += zs->zst_stride;
/* stop if we've run out of stuff to prefetch */
if (blocks_fetched < zs->zst_len)
break;
}
zs->zst_ph_offset = prefetch_tail;
zs->zst_last = ddi_get_lbolt();
}
void void
zfetch_init(void) zfetch_init(void)
{ {
zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL); KSTAT_FLAG_VIRTUAL);
@ -266,273 +100,41 @@ zfetch_fini(void)
void void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{ {
if (zf == NULL) { if (zf == NULL)
return; return;
}
zf->zf_dnode = dno; zf->zf_dnode = dno;
zf->zf_stream_cnt = 0;
zf->zf_alloc_fail = 0;
list_create(&zf->zf_stream, sizeof (zstream_t), list_create(&zf->zf_stream, sizeof (zstream_t),
offsetof(zstream_t, zst_node)); offsetof(zstream_t, zs_node));
rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
} }
/* static void
* This function computes the actual size, in blocks, that can be prefetched, dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
* and fetches it.
*/
static uint64_t
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{ {
uint64_t fetchsz; ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
uint64_t i; list_remove(&zf->zf_stream, zs);
mutex_destroy(&zs->zs_lock);
fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); kmem_free(zs, sizeof (*zs));
for (i = 0; i < fetchsz; i++) {
dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
ARC_FLAG_PREFETCH);
}
return (fetchsz);
} }
/* /*
* this function returns the number of blocks that would be prefetched, based * Clean-up state associated with a zfetch structure (e.g. destroy the
* upon the supplied dnode, blockid, and nblks. This is used so that we can * streams). This doesn't free the zfetch_t itself, that's left to the caller.
* update streams in place, and then prefetch with their old value after the
* fact. This way, we can delay the prefetch, but subsequent accesses to the
* stream won't result in the same data being prefetched multiple times.
*/
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
uint64_t fetchsz;
if (blkid > dn->dn_maxblkid) {
return (0);
}
/* compute fetch size */
if (blkid + nblks + 1 > dn->dn_maxblkid) {
fetchsz = (dn->dn_maxblkid - blkid) + 1;
ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
} else {
fetchsz = nblks;
}
return (fetchsz);
}
/*
 * Given a zfetch and a zstream structure describing a block read, see if
 * there is an associated zstream for this read.  If so, start a prefetch for
 * the stream that was located and return true (1); otherwise return false (0).
 *
 * zf		prefetch state whose zf_stream list is searched
 * zh		the access being looked up; only its zst_offset and zst_len
 *		are read here.  A NULL zh returns 0 immediately.
 * prefetched	non-zero appears to mean the accessed block had already been
 *		prefetched (see the "already fetched" branch below) -- TODO
 *		confirm against the callers.
 *
 * Locking: zf_rwlock is taken as READER for the scan.  A matching stream is
 * modified under its own zst_lock, which stays held from the "break" out of
 * the loop until the prefetch has been issued or the stream is being reset.
 * A reset drops both locks and re-acquires zf_rwlock as WRITER before the
 * stream is removed and freed.  zf_rwlock is always released before return.
 */
static boolean_t
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
zstream_t *zs;
int64_t diff;
/* By default, a match on a block that was not prefetched resets the stream. */
int reset = !prefetched;
int rc = 0;
if (zh == NULL)
return (0);
/*
 * XXX: This locking strategy is a bit coarse; however, its impact has
 * yet to be tested. If this turns out to be an issue, it can be
 * modified in a number of different ways.
 */
rw_enter(&zf->zf_rwlock, RW_READER);
/*
 * The scan restarts from here whenever a stream changed between the
 * unlocked match test and the re-check performed under zst_lock.
 * NOTE(review): "reset" is not re-initialized on restart, so a value
 * assigned by an earlier sequential match carries over into the new scan.
 */
top:
for (zs = list_head(&zf->zf_stream); zs;
zs = list_next(&zf->zf_stream, zs)) {
/*
 * XXX - should this be an assert?
 */
if (zs->zst_len == 0) {
/* bogus stream */
ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
continue;
}
/*
 * We hit this case when we are in a strided prefetch stream:
 * we will read "len" blocks before "striding".
 */
if (zh->zst_offset >= zs->zst_offset &&
zh->zst_offset < zs->zst_offset + zs->zst_len) {
if (prefetched) {
/* already fetched */
ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
rc = 1;
goto out;
} else {
/* Only counted; the checks below still run for this stream. */
ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
}
}
/*
 * This is the forward sequential read case: we increment
 * len by one each time we hit here, so we will enter this
 * case on every read.
 */
if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
reset = !prefetched && zs->zst_len > 1;
mutex_enter(&zs->zst_lock);
/* Re-check under the lock; the stream may have moved meanwhile. */
if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
}
/*
 * Grow the stream by this access, then cap its length at
 * zfetch_block_cap by sliding the start of the window forward.
 */
zs->zst_len += zh->zst_len;
diff = zs->zst_len - zfetch_block_cap;
if (diff > 0) {
zs->zst_offset += diff;
zs->zst_len = zs->zst_len > diff ?
zs->zst_len - diff : 0;
}
zs->zst_direction = ZFETCH_FORWARD;
break;
/*
 * Same as above, but reading backwards through the file.
 */
} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
/* backwards sequential access */
reset = !prefetched && zs->zst_len > 1;
mutex_enter(&zs->zst_lock);
/* Re-check under the lock; the stream may have moved meanwhile. */
if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
}
/*
 * Move the window start and zst_ph_offset back by the length of
 * this access (clamping at 0) and grow the stream accordingly.
 */
zs->zst_offset = zs->zst_offset > zh->zst_len ?
zs->zst_offset - zh->zst_len : 0;
zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
zs->zst_ph_offset - zh->zst_len : 0;
zs->zst_len += zh->zst_len;
diff = zs->zst_len - zfetch_block_cap;
/* Over the cap: pull zst_ph_offset back and trim the length. */
if (diff > 0) {
zs->zst_ph_offset = zs->zst_ph_offset > diff ?
zs->zst_ph_offset - diff : 0;
zs->zst_len = zs->zst_len > diff ?
zs->zst_len - diff : zs->zst_len;
}
zs->zst_direction = ZFETCH_BACKWARD;
break;
} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
/* strided forward access */
mutex_enter(&zs->zst_lock);
/* Re-check the same condition now that the lock is held. */
if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
goto top;
}
/* Advance the window by one stride. */
zs->zst_offset += zs->zst_stride;
zs->zst_direction = ZFETCH_FORWARD;
break;
} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
/* strided reverse access */
mutex_enter(&zs->zst_lock);
/* Re-check the same condition now that the lock is held. */
if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
goto top;
}
/*
 * Step the window back by one stride and zst_ph_offset back by
 * two strides, clamping both at 0.
 */
zs->zst_offset = zs->zst_offset > zs->zst_stride ?
zs->zst_offset - zs->zst_stride : 0;
zs->zst_ph_offset = (zs->zst_ph_offset >
(2 * zs->zst_stride)) ?
(zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
zs->zst_direction = ZFETCH_BACKWARD;
break;
}
}
/*
 * A non-NULL zs here means one of the cases above matched and broke out
 * of the loop with zs->zst_lock held.
 */
if (zs) {
if (reset) {
zstream_t *remove = zs;
ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
rc = 0;
/* Drop both locks so zf_rwlock can be re-taken as WRITER. */
mutex_exit(&zs->zst_lock);
rw_exit(&zf->zf_rwlock);
rw_enter(&zf->zf_rwlock, RW_WRITER);
/*
 * Relocate the stream, in case someone removes
 * it while we were acquiring the WRITER lock.
 */
for (zs = list_head(&zf->zf_stream); zs;
zs = list_next(&zf->zf_stream, zs)) {
if (zs == remove) {
dmu_zfetch_stream_remove(zf, zs);
mutex_destroy(&zs->zst_lock);
kmem_free(zs, sizeof (zstream_t));
break;
}
}
} else {
ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
rc = 1;
/* Issue the prefetch while still holding the stream's lock. */
dmu_zfetch_dofetch(zf, zs);
mutex_exit(&zs->zst_lock);
}
}
out:
rw_exit(&zf->zf_rwlock);
return (rc);
}
/*
* Clean-up state associated with a zfetch structure. This frees allocated
* structure members, empties the zf_stream tree, and generally makes things
* nice. This doesn't free the zfetch_t itself, that's left to the caller.
*/ */
void void
dmu_zfetch_rele(zfetch_t *zf) dmu_zfetch_fini(zfetch_t *zf)
{ {
zstream_t *zs; zstream_t *zs;
zstream_t *zs_next;
ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { rw_enter(&zf->zf_rwlock, RW_WRITER);
zs_next = list_next(&zf->zf_stream, zs); while ((zs = list_head(&zf->zf_stream)) != NULL)
dmu_zfetch_stream_remove(zf, zs);
list_remove(&zf->zf_stream, zs); rw_exit(&zf->zf_rwlock);
mutex_destroy(&zs->zst_lock);
kmem_free(zs, sizeof (zstream_t));
}
list_destroy(&zf->zf_stream); list_destroy(&zf->zf_stream);
rw_destroy(&zf->zf_rwlock); rw_destroy(&zf->zf_rwlock);
@ -540,101 +142,57 @@ dmu_zfetch_rele(zfetch_t *zf)
} }
/* /*
* Given a zfetch and zstream structure, insert the zstream structure into the * If there aren't too many streams already, create a new stream.
* AVL tree contained within the zfetch structure. Perform the appropriate * The "blkid" argument is the next block that we expect this stream to access.
* book-keeping. It is possible that another thread has inserted a stream which * While we're here, clean up old streams (which haven't been
* matches one that we are about to insert, so we must be sure to check for this * accessed for at least zfetch_min_sec_reap seconds).
* case. If one is found, return failure, and let the caller cleanup the
* duplicates.
*/
/*
 * Put zs at the head of zf's stream list unless a stream that compares equal
 * to it (per dmu_zfetch_streams_equal()) is already on the list.  The caller
 * must hold zf_rwlock as writer.
 *
 * Returns 1 if zs was inserted (zf_stream_cnt is incremented), or 0 if a
 * duplicate was found, in which case the list is left untouched and zs
 * remains the caller's to dispose of.
 */
static int
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
{
	zstream_t *cur;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	/* Refuse the insert as soon as an equal stream is seen. */
	cur = list_head(&zf->zf_stream);
	while (cur != NULL) {
		if (dmu_zfetch_streams_equal(cur, zs))
			return (0);
		cur = list_next(&zf->zf_stream, cur);
	}

	list_insert_head(&zf->zf_stream, zs);
	zf->zf_stream_cnt++;

	return (1);
}
/*
 * Try to recycle an idle stream for the caller.  The first stream on zf's
 * list whose zst_last is more than zfetch_min_sec_reap (lbolt ticks divided
 * by hz) in the past is unlinked, has its zst_lock destroyed, and is zeroed.
 *
 * Returns the recycled stream, which the caller must fully re-initialize
 * (including zst_lock) before use.  Returns NULL if zf_rwlock could not be
 * taken as writer without blocking, or if no stream is old enough; the
 * latter case also increments zf_alloc_fail.
 */
static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t *zf)
{
	zstream_t *victim;

	/* Never block here: give up if the writer lock is contended. */
	if (rw_tryenter(&zf->zf_rwlock, RW_WRITER) == 0)
		return (NULL);

	/* Skip over streams that have been used too recently. */
	victim = list_head(&zf->zf_stream);
	while (victim != NULL &&
	    !(((ddi_get_lbolt() - victim->zst_last)/hz) >
	    zfetch_min_sec_reap))
		victim = list_next(&zf->zf_stream, victim);

	if (victim == NULL) {
		zf->zf_alloc_fail++;
	} else {
		dmu_zfetch_stream_remove(zf, victim);
		mutex_destroy(&victim->zst_lock);
		bzero(victim, sizeof (*victim));
	}

	rw_exit(&zf->zf_rwlock);

	return (victim);
}
/*
* Given a zfetch and zstream structure, remove the zstream structure from its
* container in the zfetch structure. Perform the appropriate book-keeping.
*/ */
static void static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{ {
zstream_t *zs;
zstream_t *zs_next;
int numstreams = 0;
uint32_t max_streams;
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
list_remove(&zf->zf_stream, zs); /*
zf->zf_stream_cnt--; * Clean up old streams.
} */
for (zs = list_head(&zf->zf_stream);
zs != NULL; zs = zs_next) {
zs_next = list_next(&zf->zf_stream, zs);
if (((gethrtime() - zs->zs_atime) / NANOSEC) >
zfetch_min_sec_reap)
dmu_zfetch_stream_remove(zf, zs);
else
numstreams++;
}
static int /*
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) * The maximum number of streams is normally zfetch_max_streams,
{ * but for small files we lower it such that it's at least possible
if (zs1->zst_offset != zs2->zst_offset) * for all the streams to be non-overlapping.
return (0); *
* If we are already at the maximum number of streams for this file,
* even after removing old streams, then don't create this stream.
*/
max_streams = MAX(1, MIN(zfetch_max_streams,
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
zfetch_max_distance));
if (numstreams >= max_streams) {
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
return;
}
if (zs1->zst_len != zs2->zst_len) zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
return (0); zs->zs_blkid = blkid;
zs->zs_pf_blkid = blkid;
zs->zs_atime = gethrtime();
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
if (zs1->zst_stride != zs2->zst_stride) list_insert_head(&zf->zf_stream, zs);
return (0);
if (zs1->zst_ph_offset != zs2->zst_ph_offset)
return (0);
if (zs1->zst_cap != zs2->zst_cap)
return (0);
if (zs1->zst_direction != zs2->zst_direction)
return (0);
return (1);
} }
/* /*
@ -642,93 +200,91 @@ dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
* routines to create, delete, find, or operate upon prefetch streams. * routines to create, delete, find, or operate upon prefetch streams.
*/ */
void void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
{ {
zstream_t zst; zstream_t *zs;
zstream_t *newstream; int64_t pf_start;
boolean_t fetched; int pf_nblks;
int inserted; int i;
unsigned int blkshft;
uint64_t blksz;
if (zfs_prefetch_disable) if (zfs_prefetch_disable)
return; return;
/* files that aren't ln2 blocksz are only one block -- nothing to do */ /*
if (!zf->zf_dnode->dn_datablkshift) * As a fast path for small (single-block) files, ignore access
* to the first block.
*/
if (blkid == 0)
return; return;
/* convert offset and size, into blockid and nblocks */ rw_enter(&zf->zf_rwlock, RW_READER);
blkshft = zf->zf_dnode->dn_datablkshift;
blksz = (1 << blkshft);
bzero(&zst, sizeof (zstream_t)); for (zs = list_head(&zf->zf_stream); zs != NULL;
zst.zst_offset = offset >> blkshft; zs = list_next(&zf->zf_stream, zs)) {
zst.zst_len = (P2ROUNDUP(offset + size, blksz) - if (blkid == zs->zs_blkid) {
P2ALIGN(offset, blksz)) >> blkshft; mutex_enter(&zs->zs_lock);
/*
fetched = dmu_zfetch_find(zf, &zst, prefetched); * zs_blkid could have changed before we
if (fetched) { * acquired zs_lock; re-check them here.
ZFETCHSTAT_BUMP(zfetchstat_hits); */
} else { if (blkid != zs->zs_blkid) {
ZFETCHSTAT_BUMP(zfetchstat_misses); mutex_exit(&zs->zs_lock);
if ((fetched = dmu_zfetch_colinear(zf, &zst))) { continue;
ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); }
} else { break;
ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
} }
} }
if (!fetched) { if (zs == NULL) {
newstream = dmu_zfetch_stream_reclaim(zf);
/* /*
* we still couldn't find a stream, drop the lock, and allocate * This access is not part of any existing stream. Create
* one if possible. Otherwise, give up and go home. * a new stream for it.
*/ */
if (newstream) { ZFETCHSTAT_BUMP(zfetchstat_misses);
ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); if (rw_tryupgrade(&zf->zf_rwlock))
} else { dmu_zfetch_stream_create(zf, blkid + nblks);
uint64_t maxblocks;
uint32_t max_streams;
uint32_t cur_streams;
ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
cur_streams = zf->zf_stream_cnt;
maxblocks = zf->zf_dnode->dn_maxblkid;
max_streams = MIN(zfetch_max_streams,
(maxblocks / zfetch_block_cap));
if (max_streams == 0) {
max_streams++;
}
if (cur_streams >= max_streams) {
return;
}
newstream =
kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
}
newstream->zst_offset = zst.zst_offset;
newstream->zst_len = zst.zst_len;
newstream->zst_stride = zst.zst_len;
newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
newstream->zst_cap = zst.zst_len;
newstream->zst_direction = ZFETCH_FORWARD;
newstream->zst_last = ddi_get_lbolt();
mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
rw_enter(&zf->zf_rwlock, RW_WRITER);
inserted = dmu_zfetch_stream_insert(zf, newstream);
rw_exit(&zf->zf_rwlock); rw_exit(&zf->zf_rwlock);
return;
if (!inserted) {
mutex_destroy(&newstream->zst_lock);
kmem_free(newstream, sizeof (zstream_t));
}
} }
/*
* This access was to a block that we issued a prefetch for on
* behalf of this stream. Issue further prefetches for this stream.
*
* Normally, we start prefetching where we stopped
* prefetching last (zs_pf_blkid). But when we get our first
* hit on this stream, zs_pf_blkid == zs_blkid, we don't
* want to prefetch to block we just accessed. In this case,
* start just after the block we just accessed.
*/
pf_start = MAX(zs->zs_pf_blkid, blkid + nblks);
/*
* Double our amount of prefetched data, but don't let the
* prefetch get further ahead than zfetch_max_distance.
*/
pf_nblks =
MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks,
zs->zs_blkid + nblks +
(zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start);
zs->zs_pf_blkid = pf_start + pf_nblks;
zs->zs_atime = gethrtime();
zs->zs_blkid = blkid + nblks;
/*
* dbuf_prefetch() issues the prefetch i/o
* asynchronously, but it may need to wait for an
* indirect block to be read from disk. Therefore
* we do not want to hold any locks while we call it.
*/
mutex_exit(&zs->zs_lock);
rw_exit(&zf->zf_rwlock);
for (i = 0; i < pf_nblks; i++) {
dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
}
ZFETCHSTAT_BUMP(zfetchstat_hits);
} }
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)
@ -741,8 +297,9 @@ MODULE_PARM_DESC(zfetch_max_streams, "Max number of streams per zfetch");
module_param(zfetch_min_sec_reap, uint, 0644); module_param(zfetch_min_sec_reap, uint, 0644);
MODULE_PARM_DESC(zfetch_min_sec_reap, "Min time before stream reclaim"); MODULE_PARM_DESC(zfetch_min_sec_reap, "Min time before stream reclaim");
module_param(zfetch_block_cap, uint, 0644); module_param(zfetch_max_distance, uint, 0644);
MODULE_PARM_DESC(zfetch_block_cap, "Max number of blocks to fetch at a time"); MODULE_PARM_DESC(zfetch_max_distance,
"Max bytes to prefetch per stream (default 8MB)");
module_param(zfetch_array_rd_sz, ulong, 0644); module_param(zfetch_array_rd_sz, ulong, 0644);
MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read"); MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read");

View File

@ -524,7 +524,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_id_flags = 0; dn->dn_id_flags = 0;
dn->dn_unlisted_l0_blkid = 0; dn->dn_unlisted_l0_blkid = 0;
dmu_zfetch_rele(&dn->dn_zfetch); dmu_zfetch_fini(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn); kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
@ -773,8 +773,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
dmu_zfetch_init(&ndn->dn_zfetch, NULL); dmu_zfetch_init(&ndn->dn_zfetch, NULL);
list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
/* /*
* Update back pointers. Updating the handle fixes the back pointer of * Update back pointers. Updating the handle fixes the back pointer of