diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index f1a8ff61d7..642cab5f2f 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -4042,7 +4042,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { @@ -4066,7 +4066,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || + chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( @@ -4111,7 +4112,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[j]->b_data, chunksize); } else { @@ -4128,7 +4129,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); } else { diff --git a/include/sys/arc.h b/include/sys/arc.h index 25e2f03525..903f0b4131 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -81,10 +81,29 @@ typedef enum arc_flags ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ - ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ - ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ + ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 16, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 17, + ARC_FLAG_HAS_L2HDR = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
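The seven compression bits can be read and written with plain shift-and-mask arithmetic. The sketch below, using invented ex_* names, mirrors what the HDR_GET_COMPRESS()/HDR_SET_COMPRESS() macros later in this patch do with BF32_GET()/BF32_SET(); it is an illustration, not part of the change itself.

#include <assert.h>
#include <stdint.h>

#define	EX_COMPRESS_OFFSET	24
#define	EX_COMPRESS_NBITS	7
#define	EX_COMPRESS_MASK	(((1U << EX_COMPRESS_NBITS) - 1) << EX_COMPRESS_OFFSET)

/* extract the 7-bit compression function stored in bits 24..30 of flags */
static inline uint32_t
ex_get_compress(uint32_t flags)
{
	return ((flags & EX_COMPRESS_MASK) >> EX_COMPRESS_OFFSET);
}

/* store a compression function without disturbing the other flag bits */
static inline uint32_t
ex_set_compress(uint32_t flags, uint32_t cmp)
{
	assert(cmp < (1U << EX_COMPRESS_NBITS));
	return ((flags & ~EX_COMPRESS_MASK) | (cmp << EX_COMPRESS_OFFSET));
}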
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 1f8351a678..556cc25833 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -74,8 +74,6 @@ typedef struct arc_state { arc_state_type_t arcs_state; } arc_state_t; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -96,27 +94,45 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. 
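The memory saving comes from the member order: the L1-only sub-header is the last field of arc_buf_hdr_t, so an L2-only header can be allocated from a cache whose object size stops right before it. A minimal stand-alone sketch of that layout trick follows; the ex_* names are invented, and the real constants are HDR_FULL_SIZE and HDR_L2ONLY_SIZE in arc.c.

#include <stddef.h>
#include <stdint.h>

typedef struct ex_l2_fields { uint64_t daddr; int32_t asize; } ex_l2_fields_t;
typedef struct ex_l1_fields { void *buf; uint32_t datacnt; } ex_l1_fields_t;

typedef struct ex_hdr {
	uint64_t	birth;	/* common fields, always valid */
	uint32_t	flags;
	ex_l2_fields_t	l2;	/* embedded; undefined unless the L2 flag is set */
	ex_l1_fields_t	l1;	/* must remain the last member */
} ex_hdr_t;

/* a full allocation covers everything ... */
#define	EX_HDR_FULL_SIZE	((int64_t)sizeof (ex_hdr_t))
/* ... while an L2-only allocation stops right before the L1 sub-header */
#define	EX_HDR_L2ONLY_SIZE	((int64_t)offsetof(ex_hdr_t, l1))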
+ */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - arc_flags_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -133,9 +149,10 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; - list_node_t b_l2node; -}; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + void *b_tmp_cdata; +} l1arc_buf_hdr_t; typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ @@ -146,15 +163,51 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + uint32_t b_hits; + int32_t b_asize; + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. 
Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; +}; #ifdef __cplusplus } #endif diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 8b885eff73..b9df228eae 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; - __entry->hdr_cksum0 = ab->b_cksum0; __entry->hdr_flags = ab->b_flags; - __entry->hdr_datacnt = ab->b_datacnt; - __entry->hdr_type = ab->b_type; + __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt; __entry->hdr_size = ab->b_size; __entry->hdr_spa = ab->b_spa; - __entry->hdr_state_type = ab->b_state->arcs_state; - __entry->hdr_access = ab->b_arc_access; - __entry->hdr_mru_hits = ab->b_mru_hits; - __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits; - __entry->hdr_mfu_hits = ab->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits; - __entry->hdr_l2_hits = ab->b_l2_hits; - __entry->hdr_refcount = ab->b_refcnt.rc_count; + __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = ab->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " "flags 0x%x datacnt %u type %u size %llu spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, + __entry->hdr_birth, __entry->hdr_flags, __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, @@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; - __entry->hdr_cksum0 = hdr->b_cksum0; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_datacnt = hdr->b_datacnt; - __entry->hdr_type = hdr->b_type; + __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt; __entry->hdr_size = hdr->b_size; __entry->hdr_spa = hdr->b_spa; - __entry->hdr_state_type = hdr->b_state->arcs_state; - __entry->hdr_access = hdr->b_arc_access; - __entry->hdr_mru_hits = hdr->b_mru_hits; - __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits; - __entry->hdr_mfu_hits = hdr->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - __entry->hdr_l2_hits = hdr->b_l2_hits; - __entry->hdr_refcount = hdr->b_refcnt.rc_count; + __entry->hdr_state_type = 
hdr->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = hdr->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; @@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_level = zb->zb_level; __entry->zb_blkid = zb->zb_blkid; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " - "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " + "flags 0x%x datacnt %u size %llu spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "lsize %llu } zb { objset %llu object %llu level %lli " "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, + __entry->hdr_birth, __entry->hdr_flags, + __entry->hdr_datacnt, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 7b34e68258..e69889ab58 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -118,7 +118,7 @@ * Note that the majority of the performance stats are manipulated * with atomic operations. 
* - * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction @@ -308,6 +308,7 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; @@ -396,6 +397,7 @@ static arc_stats_t arc_stats = { { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, @@ -506,22 +508,38 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) \ - ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) + #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET 24 +#define HDR_COMPRESS_NBITS 7 + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) + /* * Other sizes */ -#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines @@ -591,7 +609,6 @@ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ @@ -606,19 +623,6 @@ typedef struct l2arc_read_callback { enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; -struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - /* compression applied to buffer data */ - enum zio_compress b_compress; - /* real 
alloc'd buffer size depending on b_compress applied */ - uint32_t b_hits; - uint64_t b_asize; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; -}; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; @@ -637,12 +641,13 @@ static int arc_evict_needed(arc_buf_contents_t); static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); static void arc_buf_watch(arc_buf_t *); +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); + static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static void l2arc_hdr_stat_add(void); -static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); static void l2arc_release_cdata_buf(arc_buf_hdr_t *); @@ -665,8 +670,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_cksum0 == 0) + (buf)->b_dva.dva_word[1] == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -679,7 +683,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr) hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; - hdr->b_cksum0 = 0; } static arc_buf_hdr_t * @@ -709,6 +712,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) @@ -721,8 +725,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - *lockp = hash_lock; - mutex_enter(hash_lock); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) @@ -777,7 +787,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr) /* * Global data structures and functions for the buf kmem cache. 
*/ -static kmem_cache_t *hdr_cache; +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void @@ -798,7 +809,8 @@ buf_fini(void) #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -808,17 +820,29 @@ buf_fini(void) */ /* ARGSUSED */ static int -hdr_cons(void *vbuf, void *unused, int kmflag) +hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, sizeof (arc_buf_hdr_t)); - refcount_create(&hdr->b_refcnt); - cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&hdr->b_arc_node); - list_link_init(&hdr->b_l2node); - arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } @@ -842,15 +866,25 @@ buf_cons(void *vbuf, void *unused, int kmflag) */ /* ARGSUSED */ static void -hdr_dest(void *vbuf, void *unused) +hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(BUF_EMPTY(hdr)); - refcount_destroy(&hdr->b_refcnt); - cv_destroy(&hdr->b_cv); - mutex_destroy(&hdr->b_freeze_lock); - arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) +{ + ASSERTV(arc_buf_hdr_t *hdr = vbuf); + + ASSERT(BUF_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ @@ -897,8 +931,11 @@ retry: goto retry; } - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, + NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -912,6 +949,81 @@ retry: } } +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
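A header therefore changes size when it moves between the two states, and the copy only has to preserve the common-plus-L2 prefix that both layouts share. Below is a stripped-down, malloc-based sketch of that idea with invented ex_* names; the real arc_hdr_realloc() additionally rehashes the header and relinks it on the device's l2ad_buflist.

#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <stdint.h>

typedef struct ex_hdr { uint64_t common; uint64_t l2_fields; uint64_t l1_fields; } ex_hdr_t;
#define	EX_FULL_SIZE	(sizeof (ex_hdr_t))
#define	EX_L2ONLY_SIZE	(offsetof(ex_hdr_t, l1_fields))

/* reallocate a header into the other layout, keeping the shared prefix */
static ex_hdr_t *
ex_hdr_realloc(ex_hdr_t *hdr, size_t new_size)
{
	ex_hdr_t *nhdr = malloc(new_size);

	if (nhdr == NULL)
		return (NULL);
	/* both layouts share only the leading EX_L2ONLY_SIZE bytes */
	memcpy(nhdr, hdr, EX_L2ONLY_SIZE);
	free(hdr);
	return (nhdr);
}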
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev; + + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + dev = hdr->b_l2hdr.b_dev; + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + if (new == hdr_full_cache) { + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + } else { + ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* + * We might be removing the L1hdr of a buffer which was just + * written out to L2ARC. If such a buffer is compressed then we + * need to free its b_tmp_cdata before destroying the header. + */ + if (hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. + */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. + */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + buf_discard_identity(hdr); + hdr->b_freeze_cksum = NULL; + kmem_cache_free(old, hdr); + + return (nhdr); +} + + #define ARC_MINTIME (hz>>4) /* 62 ms */ static void @@ -922,16 +1034,15 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); } static int @@ -940,10 +1051,10 @@ arc_cksum_equal(arc_buf_t *buf) zio_cksum_t zc; int equal; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return (equal); } @@ -954,16 +1065,16 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } 
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -997,24 +1108,50 @@ arc_buf_watch(arc_buf_t *buf) #endif } +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + if (HDR_ISTYPE_METADATA(hdr)) { + return (ARC_BUFC_METADATA); + } else { + return (ARC_BUFC_DATA); + } +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_state != arc_anon) + if (buf->b_hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) + if (HDR_IO_IN_PROGRESS(buf->b_hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_unwatch(buf); } @@ -1031,7 +1168,7 @@ arc_buf_freeze(arc_buf_t *buf) mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); + buf->b_hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); @@ -1040,30 +1177,37 @@ arc_buf_freeze(arc_buf_t *buf) static void add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { + arc_state_t *state; + + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(hash_lock)); - if ((refcount_add(&hdr->b_refcnt, tag) == 1) && - (hdr->b_state != arc_anon)) { - uint64_t delta = hdr->b_size * hdr->b_datacnt; - list_t *list = &hdr->b_state->arcs_list[hdr->b_type]; - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + state = hdr->b_l1hdr.b_state; - ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx)); - mutex_enter(&hdr->b_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(list, hdr); - if (GHOST_STATE(hdr->b_state)) { - ASSERT0(hdr->b_datacnt); - ASSERT3P(hdr->b_buf, ==, NULL); - delta = hdr->b_size; + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. 
*/ + if (state != arc_l2c_only) { + uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; + list_t *list = &state->arcs_list[arc_buf_type(hdr)]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + delta = hdr->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(&state->arcs_mtx); } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); - mutex_exit(&hdr->b_state->arcs_mtx); /* remove the prefetch flag if we get a reference */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } @@ -1071,21 +1215,27 @@ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = hdr->b_state; + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. + */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[hdr->b_type]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&hdr->b_arc_node)); - list_insert_head(&state->arcs_list[hdr->b_type], hdr); - ASSERT(hdr->b_datacnt > 0); - atomic_add_64(size, hdr->b_size * hdr->b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); + atomic_add_64(size, hdr->b_size * + hdr->b_l1hdr.b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -1102,31 +1252,45 @@ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; - arc_state_t *state = hdr->b_state; + l1arc_buf_hdr_t *l1hdr = NULL; + l2arc_buf_hdr_t *l2hdr = NULL; + arc_state_t *state = NULL; + + if (HDR_HAS_L1HDR(hdr)) { + l1hdr = &hdr->b_l1hdr; + state = l1hdr->b_state; + } + if (HDR_HAS_L2HDR(hdr)) + l2hdr = &hdr->b_l2hdr; memset(abi, 0, sizeof (arc_buf_info_t)); abi->abi_flags = hdr->b_flags; - abi->abi_datacnt = hdr->b_datacnt; - abi->abi_state_type = state ? 
state->arcs_state : ARC_STATE_ANON; - abi->abi_state_contents = hdr->b_type; - abi->abi_state_index = -1; - abi->abi_size = hdr->b_size; - abi->abi_access = hdr->b_arc_access; - abi->abi_mru_hits = hdr->b_mru_hits; - abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; - abi->abi_mfu_hits = hdr->b_mfu_hits; - abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - abi->abi_holds = refcount_count(&hdr->b_refcnt); - if (hdr->b_l2hdr) { - abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; - abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; - abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; - abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; + if (l1hdr) { + abi->abi_datacnt = l1hdr->b_datacnt; + abi->abi_access = l1hdr->b_arc_access; + abi->abi_mru_hits = l1hdr->b_mru_hits; + abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = l1hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&l1hdr->b_refcnt); } - if (state && state_index && list_link_active(&hdr->b_arc_node)) { - list_t *list = &state->arcs_list[hdr->b_type]; + if (l2hdr) { + abi->abi_l2arc_dattr = l2hdr->b_daddr; + abi->abi_l2arc_asize = l2hdr->b_asize; + abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr); + abi->abi_l2arc_hits = l2hdr->b_hits; + } + + abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; + abi->abi_state_contents = arc_buf_type(hdr); + abi->abi_state_index = -1; + abi->abi_size = hdr->b_size; + + if (l1hdr && state && state_index && + list_link_active(&l1hdr->b_arc_node)) { + list_t *list = &state->arcs_list[arc_buf_type(hdr)]; arc_buf_hdr_t *h; mutex_enter(&state->arcs_mtx); @@ -1147,40 +1311,60 @@ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *old_state = hdr->b_state; - int64_t refcnt = refcount_count(&hdr->b_refcnt); + arc_state_t *old_state; + int64_t refcnt; + uint32_t datacnt; uint64_t from_delta, to_delta; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + datacnt = hdr->b_l1hdr.b_datacnt; + } else { + old_state = arc_l2c_only; + refcnt = 0; + datacnt = 0; + } ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || hdr->b_datacnt > 0); - ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || datacnt > 0); + ASSERT(!GHOST_STATE(new_state) || datacnt == 0); + ASSERT(old_state != arc_anon || datacnt <= 1); - from_delta = to_delta = hdr->b_datacnt * hdr->b_size; + from_delta = to_delta = datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. 
*/ if (refcnt == 0) { - if (old_state != arc_anon) { + if (old_state != arc_anon && old_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &old_state->arcs_lsize[buftype]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(&old_state->arcs_list[hdr->b_type], hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. */ - if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { + if (GHOST_STATE(old_state) && datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(hdr->b_buf == NULL); + ASSERT(hdr->b_l1hdr.b_buf == NULL); from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); @@ -1189,20 +1373,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (use_mutex) mutex_exit(&old_state->arcs_mtx); } - if (new_state != arc_anon) { + if (new_state != arc_anon && new_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &new_state->arcs_lsize[buftype]; + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list[hdr->b_type], - hdr); + list_insert_head(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(hdr->b_datacnt == 0); - ASSERT(hdr->b_buf == NULL); + ASSERT0(datacnt); + ASSERT(hdr->b_l1hdr.b_buf == NULL); to_delta = hdr->b_size; } atomic_add_64(size, to_delta); @@ -1216,20 +1406,22 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - /* adjust state sizes */ - if (to_delta) + /* adjust state sizes (ignore arc_l2c_only) */ + if (to_delta && new_state != arc_l2c_only) atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { + if (from_delta && old_state != arc_l2c_only) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - hdr->b_state = new_state; + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; - /* adjust l2arc hdr stats */ - if (new_state == arc_l2c_only) - l2arc_hdr_stat_add(); - else if (old_state == arc_l2c_only) - l2arc_hdr_stat_remove(); + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. 
+ */ + ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1307,30 +1499,36 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; VERIFY3U(size, <=, spa_maxblocksize(spa)); - hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); + ASSERT3P(hdr->b_freeze_cksum, ==, NULL); hdr->b_size = size; - hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; + + hdr->b_flags = arc_bufc_to_flags(type); + hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + hdr->b_l1hdr.b_buf = buf; + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_datacnt = 1; + arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); return (buf); } @@ -1363,8 +1561,9 @@ arc_return_buf(arc_buf_t *buf, void *tag) arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - (void) refcount_add(&hdr->b_refcnt, tag); - (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } @@ -1373,12 +1572,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - hdr = buf->b_hdr; - (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); - (void) refcount_remove(&hdr->b_refcnt, tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; @@ -1392,15 +1591,16 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - ASSERT(hdr->b_state != arc_anon); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); @@ -1410,11 +1610,11 @@ arc_buf_clone(arc_buf_t *from) * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. 
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } - hdr->b_datacnt += 1; + hdr->b_l1hdr.b_datacnt += 1; return (buf); } @@ -1437,17 +1637,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } @@ -1485,10 +1688,10 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_buf_t **bufp; /* free up data associated with the buf */ - if (buf->b_data) { - arc_state_t *state = buf->b_hdr->b_state; + if (buf->b_data != NULL) { + arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -1503,11 +1706,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_space_return(size, ARC_SPACE_DATA); } } - if (list_link_active(&buf->b_hdr->b_arc_node)) { + if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; - ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); - ASSERT(state != arc_anon); + ASSERT(refcount_is_zero( + &buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); ASSERT3U(*cnt, >=, size); atomic_add_64(cnt, -size); @@ -1520,13 +1724,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) * If we're destroying a duplicate buffer make sure * that the appropriate statistics are updated. 
*/ - if (buf->b_hdr->b_datacnt > 1 && - buf->b_hdr->b_type == ARC_BUFC_DATA) { + if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && + HDR_ISTYPE_DATA(buf->b_hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); } - ASSERT(buf->b_hdr->b_datacnt > 0); - buf->b_hdr->b_datacnt -= 1; + ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); + buf->b_hdr->b_l1hdr.b_datacnt -= 1; } /* only remove the buf if requested */ @@ -1534,7 +1738,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; + bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; buf->b_next = NULL; @@ -1549,84 +1754,82 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - ASSERT3P(hdr->b_state, ==, arc_anon); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_buf == NULL || + hdr->b_l1hdr.b_datacnt > 0); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (HDR_HAS_L2HDR(hdr)) { + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); - if (l2hdr != NULL) { - boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
- */ if (!buflist_held) { - mutex_enter(&l2arc_buflist_mtx); - l2hdr = hdr->b_l2hdr; + mutex_enter(&l2hdr->b_dev->l2ad_mtx); + l2hdr = &hdr->b_l2hdr; } - if (l2hdr != NULL) { - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; - } + list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); if (!buflist_held) - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&l2hdr->b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - if (!BUF_EMPTY(hdr)) { - ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!BUF_EMPTY(hdr)) buf_discard_identity(hdr); - } - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_buf, FALSE, FALSE); - hdr->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); - } else { - arc_buf_destroy(hdr->b_buf, FALSE, TRUE); - } - } if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - ASSERT(!list_link_active(&hdr->b_arc_node)); + if (HDR_HAS_L1HDR(hdr)) { + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + + if (buf->b_efunc != NULL) { + mutex_enter(&arc_eviction_mtx); + mutex_enter(&buf->b_evict_lock); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, + FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, + TRUE); + } + } + } + ASSERT3P(hdr->b_hash_next, ==, NULL); - ASSERT3P(hdr->b_acb, ==, NULL); - kmem_cache_free(hdr_cache, hdr); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + kmem_cache_free(hdr_full_cache, hdr); + } else { + kmem_cache_free(hdr_l2only_cache, hdr); + } } void arc_buf_free(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_state != arc_anon; + int hashed = hdr->b_l1hdr.b_state != arc_anon; ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_data != NULL); @@ -1639,10 +1842,10 @@ arc_buf_free(arc_buf_t *buf, void *tag) ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); } else { - ASSERT(buf == hdr->b_buf); + ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } @@ -1656,7 +1859,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) */ mutex_enter(&arc_eviction_mtx); (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); mutex_exit(&arc_eviction_mtx); if (destroy_hdr) @@ -1676,8 +1879,8 @@ 
arc_buf_remove_ref(arc_buf_t *buf, void* tag) kmutex_t *hash_lock = NULL; boolean_t no_callback = (buf->b_efunc == NULL); - if (hdr->b_state == arc_anon) { - ASSERT(hdr->b_datacnt == 1); + if (hdr->b_l1hdr.b_state == arc_anon) { + ASSERT(hdr->b_l1hdr.b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } @@ -1685,21 +1888,22 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(hdr->b_l1hdr.b_datacnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_state != arc_anon); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); ASSERT(buf->b_data != NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { if (no_callback) arc_buf_destroy(buf, FALSE, TRUE); } else if (no_callback) { - ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } - ASSERT(no_callback || hdr->b_datacnt > 1 || - refcount_is_zero(&hdr->b_refcnt)); + ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || + refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); mutex_exit(hash_lock); return (no_callback); } @@ -1745,7 +1949,7 @@ arc_buf_eviction_needed(arc_buf_t *buf) return (B_TRUE); } - if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) evict_needed = B_TRUE; mutex_exit(&buf->b_evict_lock); @@ -1784,16 +1988,27 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; top: - mutex_enter(&state->arcs_mtx); + /* + * The ghost list lock must be acquired first in order to prevent + * a 3 party deadlock: + * + * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by + * l2ad_mtx in arc_hdr_realloc + * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx + * - arc_evict acquires arc_*->arcs_mtx, followed by + * arc_*_ghost->arcs_mtx, completing the deadlock cycle. + * + * This situation is avoided by acquiring the ghost list lock first.
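Keeping that ordering consistent is the whole fix: as long as every path that needs both locks takes the ghost-list lock before the state-list lock, no cycle of lock dependencies can form. A toy pthreads illustration of the same discipline, with invented names that are not part of the patch:

#include <pthread.h>

static pthread_mutex_t ghost_mtx = PTHREAD_MUTEX_INITIALIZER;	/* stands in for arc_*_ghost->arcs_mtx */
static pthread_mutex_t state_mtx = PTHREAD_MUTEX_INITIALIZER;	/* stands in for arc_*->arcs_mtx */

static void
ex_evict_pass(void)
{
	/* ghost lock first, matching the order arc_evict() now uses */
	pthread_mutex_lock(&ghost_mtx);
	pthread_mutex_lock(&state_mtx);
	/* ... move headers from the state list onto the ghost list ... */
	pthread_mutex_unlock(&state_mtx);
	pthread_mutex_unlock(&ghost_mtx);
}

static void
ex_ghost_pass(void)
{
	/*
	 * The ghost-only path also starts from the ghost lock, so the two
	 * passes can never wait on each other in opposite orders.
	 */
	pthread_mutex_lock(&ghost_mtx);
	/* ... evict ghost headers, possibly taking further locks ... */
	pthread_mutex_unlock(&ghost_mtx);
}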
+ */ mutex_enter(&evicted_state->arcs_mtx); + mutex_enter(&state->arcs_mtx); for (hdr = list_tail(list); hdr; hdr = hdr_prev) { hdr_prev = list_prev(list, hdr); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || - (spa && hdr->b_spa != spa) || - (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) && - ddi_get_lbolt() - hdr->b_arc_access < + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < zfs_arc_min_prefetch_lifespan)) { skipped++; continue; @@ -1818,11 +2033,11 @@ top: */ if (!recycle && count++ > arc_evict_iterations) { list_insert_after(list, hdr, &marker); - mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + mutex_exit(&evicted_state->arcs_mtx); kpreempt(KPREEMPT_SYNC); - mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); + mutex_enter(&state->arcs_mtx); hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; @@ -1832,28 +2047,29 @@ top: hash_lock = HDR_LOCK(hdr); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&hdr->b_refcnt)); - ASSERT(hdr->b_datacnt > 0); - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } - if (buf->b_data) { + if (buf->b_data != NULL) { bytes_evicted += hdr->b_size; - if (recycle && hdr->b_type == type && + if (recycle && + arc_buf_type(hdr) == type && hdr->b_size == bytes && !HDR_L2_WRITING(hdr)) { stolen = buf->b_data; recycle = FALSE; } } - if (buf->b_efunc) { + if (buf->b_efunc != NULL) { mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, buf->b_data == stolen, FALSE); - hdr->b_buf = buf->b_next; + hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; @@ -1866,7 +2082,7 @@ top: } } - if (hdr->b_l2hdr) { + if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); } else { @@ -1880,7 +2096,7 @@ top: } } - if (hdr->b_datacnt == 0) { + if (hdr->b_l1hdr.b_datacnt == 0) { arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; @@ -1896,8 +2112,8 @@ top: } } - mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + mutex_exit(&evicted_state->arcs_mtx); if (list == &state->arcs_list[ARC_BUFC_DATA] && (bytes < 0 || bytes_evicted < bytes)) { @@ -1948,7 +2164,7 @@ top: mutex_enter(&state->arcs_mtx); for (hdr = list_tail(list); hdr; hdr = hdr_prev) { hdr_prev = list_prev(list, hdr); - if (hdr->b_type > ARC_BUFC_NUMTYPES) + if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) panic("invalid hdr=%p", (void *)hdr); if (spa && hdr->b_spa != spa) continue; @@ -1980,16 +2196,23 @@ top: } if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_buf == NULL); + ASSERT(!HDR_HAS_L1HDR(hdr) || + hdr->b_l1hdr.b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += hdr->b_size; - if (hdr->b_l2hdr != NULL) { + if (HDR_HAS_L2HDR(hdr)) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); + /* + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. 
+ */ + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); mutex_exit(hash_lock); } else { arc_change_state(arc_anon, hdr, hash_lock); @@ -2253,27 +2476,27 @@ arc_flush(spa_t *spa) { uint64_t guid = 0; - if (spa) + if (spa != NULL) guid = spa_load_guid(spa); while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) + if (spa != NULL) break; } @@ -2345,8 +2568,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) } } - kmem_cache_reap_now(buf_cache); - kmem_cache_reap_now(hdr_cache); + kmem_cache_reap_now(hdr_full_cache); + kmem_cache_reap_now(hdr_l2only_cache); } /* @@ -2689,9 +2912,9 @@ arc_evict_needed(arc_buf_contents_t type) static void arc_get_data_buf(arc_buf_t *buf) { - arc_state_t *state = buf->b_hdr->b_state; + arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_buf_contents_t evict = ARC_BUFC_DATA; boolean_t recycle = TRUE; @@ -2718,8 +2941,7 @@ arc_get_data_buf(arc_buf_t *buf) * will end up on the mru list; so steal space from there. */ if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ? - arc_mru : arc_mfu; + state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu; else if (state == arc_mru_ghost) state = arc_mru; @@ -2782,20 +3004,21 @@ out: * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ - if (!GHOST_STATE(buf->b_hdr->b_state)) { + if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; - atomic_add_64(&hdr->b_state->arcs_size, size); - if (list_link_active(&hdr->b_arc_node)) { - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); + if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], + size); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (!zfs_arc_p_aggressive_disable && - arc_size < arc_c && hdr->b_state == arc_anon && + arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } @@ -2811,20 +3034,21 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) clock_t now; ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (hdr->b_state == arc_anon) { + if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. 
*/ - ASSERT(hdr->b_arc_access == 0); - hdr->b_arc_access = ddi_get_lbolt(); + ASSERT0(hdr->b_l1hdr.b_arc_access); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); - } else if (hdr->b_state == arc_mru) { + } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* @@ -2835,15 +3059,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { - if (refcount_count(&hdr->b_refcnt) == 0) { - ASSERT(list_link_active(&hdr->b_arc_node)); + if (HDR_PREFETCH(hdr)) { + if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + ASSERT(list_link_active( + &hdr->b_l1hdr.b_arc_node)); } else { hdr->b_flags &= ~ARC_FLAG_PREFETCH; - atomic_inc_32(&hdr->b_mru_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } - hdr->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; return; } @@ -2852,19 +3077,20 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (ddi_time_after(now, hdr->b_arc_access + ARC_MINTIME)) { + if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - hdr->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } - atomic_inc_32(&hdr->b_mru_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); - } else if (hdr->b_state == arc_mru_ghost) { + } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but @@ -2872,9 +3098,9 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) { + if (HDR_PREFETCH(hdr)) { new_state = arc_mru; - if (refcount_count(&hdr->b_refcnt) > 0) + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) hdr->b_flags &= ~ARC_FLAG_PREFETCH; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { @@ -2882,12 +3108,12 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_mru_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (hdr->b_state == arc_mfu) { + } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. @@ -2897,14 +3123,14 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. 
*/ - if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { - ASSERT(refcount_count(&hdr->b_refcnt) == 0); - ASSERT(list_link_active(&hdr->b_arc_node)); + if ((HDR_PREFETCH(hdr)) != 0) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); } - atomic_inc_32(&hdr->b_mfu_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_state == arc_mfu_ghost) { + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has @@ -2912,31 +3138,32 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) { + if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(refcount_count(&hdr->b_refcnt)); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_mfu_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (hdr->b_state == arc_l2c_only) { + } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { - cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_state); + cmn_err(CE_PANIC, "invalid arc state 0x%p", + hdr->b_l1hdr.b_state); } } @@ -3005,11 +3232,11 @@ arc_read_done(zio_t *zio) } hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH)) + if (l2arc_noprefetch && HDR_PREFETCH(hdr)) hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ - callback_list = hdr->b_acb; + callback_list = hdr->b_l1hdr.b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = @@ -3023,7 +3250,8 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); arc_buf_watch(buf); - if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. 
This is because * if we've issued an I/O for an evicted buffer, we've already @@ -3045,24 +3273,25 @@ arc_read_done(zio_t *zio) abuf = NULL; } } - hdr->b_acb = NULL; + hdr->b_l1hdr.b_acb = NULL; hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_datacnt == 1); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } - ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || + callback_list != NULL); if (zio->io_error != 0) { hdr->b_flags |= ARC_FLAG_IO_ERROR; - if (hdr->b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - freeable = refcount_is_zero(&hdr->b_refcnt); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* @@ -3070,9 +3299,9 @@ arc_read_done(zio_t *zio) * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ - cv_broadcast(&hdr->b_cv); + cv_broadcast(&hdr->b_l1hdr.b_cv); - if (hash_lock) { + if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* @@ -3081,8 +3310,8 @@ arc_read_done(zio_t *zio) * moved to the anonymous state (so that it won't show up * in the cache). */ - ASSERT3P(hdr->b_state, ==, arc_anon); - freeable = refcount_is_zero(&hdr->b_refcnt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ @@ -3145,14 +3374,14 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && hdr->b_datacnt > 0) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_cv, hash_lock); + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } @@ -3170,8 +3399,8 @@ top: spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); - acb->acb_next = hdr->b_acb; - hdr->b_acb = acb; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); goto out; @@ -3180,7 +3409,8 @@ top: goto out; } - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); if (done) { add_reference(hdr, hash_lock, private); @@ -3189,7 +3419,7 @@ top: * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ - buf = hdr->b_buf; + buf = hdr->b_l1hdr.b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { @@ -3200,7 +3430,7 @@ top: } } else if (*arc_flags & ARC_FLAG_PREFETCH && - refcount_count(&hdr->b_refcnt) == 0) { + refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); @@ -3211,8 +3441,8 @@ top: hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) @@ -3224,7 +3454,7 @@ top: uint64_t addr = 0; boolean_t devw = B_FALSE; enum zio_compress b_compress = ZIO_COMPRESS_OFF; - uint64_t b_asize = 0; + int32_t b_asize = 0; /* * Gracefully handle a damaged logical block size as a @@ -3251,7 +3481,6 @@ top: if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -3275,11 +3504,20 @@ top: if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_FLAG_INDIRECT; } else { - /* this block is in the ghost cache */ - ASSERT(GHOST_STATE(hdr->b_state)); + /* + * This block is in the ghost cache. If it was L2-only + * (and thus didn't have an L1 hdr), we realloc the + * header to add an L1 hdr. + */ + if (!HDR_HAS_L1HDR(hdr)) { + hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, + hdr_full_cache); + } + + ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT0(refcount_count(&hdr->b_refcnt)); - ASSERT(hdr->b_buf == NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_buf == NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_FLAG_PREFETCH) @@ -3296,29 +3534,29 @@ top: buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; - ASSERT(hdr->b_datacnt == 0); - hdr->b_datacnt = 1; + hdr->b_l1hdr.b_buf = buf; + ASSERT0(hdr->b_l1hdr.b_datacnt); + hdr->b_l1hdr.b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } - ASSERT(!GHOST_STATE(hdr->b_state)); + ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_acb == NULL); - hdr->b_acb = acb; + ASSERT(hdr->b_l1hdr.b_acb == NULL); + hdr->b_l1hdr.b_acb = acb; hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; - if (hdr->b_l2hdr != NULL && - (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr->b_dev->l2ad_writing; - addr = hdr->b_l2hdr->b_daddr; - b_compress = hdr->b_l2hdr->b_compress; - b_asize = hdr->b_l2hdr->b_asize; + if (HDR_HAS_L2HDR(hdr) && + (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr.b_dev->l2ad_writing; + addr = hdr->b_l2hdr.b_daddr; + b_compress = HDR_GET_COMPRESS(hdr); + b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. 
*/ @@ -3338,8 +3576,8 @@ top: DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { @@ -3352,14 +3590,14 @@ top: * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ - if (hdr->b_l2hdr != NULL && + if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr->b_hits); + atomic_inc_32(&hdr->b_l2hdr.b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); @@ -3481,8 +3719,9 @@ void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || + func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); @@ -3506,7 +3745,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp) if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_buf; + arc_buf_t *buf = hdr->b_l1hdr.b_buf; add_reference(hdr, hash_lock, FTAG); hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); @@ -3564,17 +3803,19 @@ arc_clear_callback(arc_buf_t *buf) hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, + hdr->b_l1hdr.b_datacnt); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); buf->b_efunc = NULL; buf->b_private = NULL; - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, FALSE, TRUE); } else { - ASSERT(buf == hdr->b_buf); + ASSERT(buf == hdr->b_l1hdr.b_buf); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } @@ -3593,10 +3834,9 @@ arc_clear_callback(arc_buf_t *buf) void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock = NULL; - l2arc_buf_hdr_t *l2hdr; - uint64_t buf_size = 0; + kmutex_t *hash_lock; + arc_state_t *state; + arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if it's DMU metadata (level > @@ -3605,56 +3845,89 @@ arc_release(arc_buf_t *buf, void *tag) */ mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; + + /* + * We don't grab the hash lock prior to this check, because if + * the buffer's header is in the arc_anon state, it won't be + * linked into the hash table. 
+ */ + if (hdr->b_l1hdr.b_state == arc_anon) { + mutex_exit(&buf->b_evict_lock); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + ASSERT(!HDR_HAS_L2HDR(hdr)); + ASSERT(BUF_EMPTY(hdr)); + + ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + + ASSERT3P(buf->b_efunc, ==, NULL); + ASSERT3P(buf->b_private, ==, NULL); + + hdr->b_l1hdr.b_arc_access = 0; + arc_buf_thaw(buf); + + return; + } + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * This assignment is only valid as long as the hash_lock is + * held, we must be careful not to reference state or the + * b_state field after dropping the lock. + */ + state = hdr->b_l1hdr.b_state; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_refcnt) > 0); + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); - if (hdr->b_state == arc_anon) { - /* this buffer is already released */ - ASSERT(buf->b_efunc == NULL); - } else { - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - } + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - l2hdr = hdr->b_l2hdr; - if (l2hdr) { - mutex_enter(&l2arc_buflist_mtx); - hdr->b_l2hdr = NULL; - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - buf_size = hdr->b_size; /* * Do we have more than one buf? */ - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; - arc_buf_contents_t type = hdr->b_type; + arc_buf_contents_t type = arc_buf_type(hdr); uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_buf != buf || buf->b_next != NULL); + ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); - bufp = &hdr->b_buf; + bufp = &hdr->b_l1hdr.b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; buf->b_next = NULL; - ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); - if (refcount_is_zero(&hdr->b_refcnt)) { - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3P(state, !=, arc_l2c_only); + ASSERT3U(state->arcs_size, >=, hdr->b_size); + atomic_add_64(&state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + uint64_t *size; + + ASSERT3P(state, !=, arc_l2c_only); + size = &state->arcs_lsize[type]; ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } @@ -3663,68 +3936,60 @@ arc_release(arc_buf_t *buf, void *tag) * We're releasing a duplicate user data buffer, update * our statistics accordingly. 
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -hdr->b_size); } - hdr->b_datacnt -= 1; + hdr->b_l1hdr.b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; - nhdr->b_type = type; - nhdr->b_buf = buf; - nhdr->b_state = arc_anon; - nhdr->b_arc_access = 0; - nhdr->b_mru_hits = 0; - nhdr->b_mru_ghost_hits = 0; - nhdr->b_mfu_hits = 0; - nhdr->b_mfu_ghost_hits = 0; - nhdr->b_l2_hits = 0; + + nhdr->b_l1hdr.b_mru_hits = 0; + nhdr->b_l1hdr.b_mru_ghost_hits = 0; + nhdr->b_l1hdr.b_mfu_hits = 0; + nhdr->b_l1hdr.b_mfu_ghost_hits = 0; + nhdr->b_l1hdr.b_l2_hits = 0; nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_l2hdr = NULL; - nhdr->b_datacnt = 1; + nhdr->b_flags |= arc_bufc_to_flags(type); + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_datacnt = 1; + nhdr->b_l1hdr.b_state = arc_anon; + nhdr->b_l1hdr.b_arc_access = 0; nhdr->b_freeze_cksum = NULL; - (void) refcount_add(&nhdr->b_refcnt, tag); + + (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { mutex_exit(&buf->b_evict_lock); - ASSERT(refcount_count(&hdr->b_refcnt) == 1); - ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); + /* protected by hash lock */ + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - if (hdr->b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; - if (hash_lock) - mutex_exit(hash_lock); + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = 0; + mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; - - if (l2hdr) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -buf_size); - mutex_exit(&l2arc_buflist_mtx); - } } int @@ -3733,7 +3998,8 @@ arc_released(arc_buf_t *buf) int released; mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + released = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } @@ -3745,7 +4011,7 @@ arc_referenced(arc_buf_t *buf) int referenced; mutex_enter(&buf->b_evict_lock); - referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } @@ -3758,7 +4024,9 @@ arc_write_ready(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); callback->awcb_ready(zio, 
buf, callback->awcb_private); /* @@ -3768,12 +4036,12 @@ arc_write_ready(zio_t *zio) * accounting for any re-write attempt. */ if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_freeze_lock); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_exit(&hdr->b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; @@ -3798,7 +4066,7 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_acb == NULL); if (zio->io_error == 0) { if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { @@ -3806,7 +4074,6 @@ arc_write_done(zio_t *zio) } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; } } else { ASSERT(BUF_EMPTY(hdr)); @@ -3827,7 +4094,7 @@ arc_write_done(zio_t *zio) arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove @@ -3837,7 +4104,8 @@ arc_write_done(zio_t *zio) if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); - ASSERT(refcount_is_zero(&exists->b_refcnt)); + ASSERT(refcount_is_zero( + &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); @@ -3851,22 +4119,22 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_datacnt == 1); - ASSERT(hdr->b_state == arc_anon); + ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (!exists && hdr->b_state == arc_anon) + if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); @@ -3886,8 +4154,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); if (l2arc) hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) @@ -4102,25 +4371,35 @@ arc_init(void) mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 
list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; @@ -4231,7 +4510,7 @@ arc_fini(void) buf_fini(); - ASSERT(arc_loaned_bytes == 0); + ASSERT0(arc_loaned_bytes); } /* @@ -4390,7 +4669,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (hdr->b_spa != spa_guid || hdr->b_l2hdr != NULL || + if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); @@ -4443,20 +4722,6 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) return (next); } -static void -l2arc_hdr_stat_add(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); -} - -static void -l2arc_hdr_stat_remove(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); -} - /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. @@ -4553,7 +4818,6 @@ l2arc_write_done(zio_t *zio) l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; - l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; int64_t bytes_dropped = 0; @@ -4563,7 +4827,7 @@ l2arc_write_done(zio_t *zio) ASSERT(dev != NULL); head = cb->l2wcb_head; ASSERT(head != NULL); - buflist = dev->l2ad_buflist; + buflist = &dev->l2ad_buflist; ASSERT(buflist != NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -4571,42 +4835,43 @@ l2arc_write_done(zio_t *zio) if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); /* * All writes completed, or an error was hit. */ for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); - abl2 = hdr->b_l2hdr; - - /* - * Release the temporary compressed buffer as soon as possible. - */ - if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage - * of eviction. 
Its ARC_L2_WRITING flag will be + * of eviction. Its ARC_FLAG_L2_WRITING flag will be * left set, denying reads to this buffer. */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } + /* + * It's possible that this buffer got evicted from the L1 cache + * before we grabbed the vdev + hash locks, in which case + * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. + * Only free the buffer if we still have an L1 hdr. + */ + if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); - bytes_dropped += abl2->b_asize; - hdr->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } @@ -4620,8 +4885,9 @@ l2arc_write_done(zio_t *zio) atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); - kmem_cache_free(hdr_cache, head); - mutex_exit(&l2arc_buflist_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); @@ -4759,16 +5025,12 @@ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; - l2arc_buf_hdr_t *abl2; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; int64_t bytes_evicted = 0; - buflist = dev->l2ad_buflist; - - if (buflist == NULL) - return; + buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* @@ -4791,7 +5053,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) uint64_t, taddr, boolean_t, all); top: - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); @@ -4801,7 +5063,7 @@ top: * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; @@ -4817,9 +5079,9 @@ top: continue; } - if (!all && hdr->b_l2hdr != NULL && - (hdr->b_l2hdr->b_daddr > taddr || - hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) { + if (!all && HDR_HAS_L2HDR(hdr) && + (hdr->b_l2hdr.b_daddr > taddr || + hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. @@ -4828,15 +5090,8 @@ top: break; } - if (HDR_FREE_IN_PROGRESS(hdr)) { - /* - * Already on the path to destruction. - */ - mutex_exit(hash_lock); - continue; - } - - if (hdr->b_state == arc_l2c_only) { + ASSERT(HDR_HAS_L2HDR(hdr)); + if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. @@ -4846,6 +5101,8 @@ top: arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { + ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); + ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write @@ -4859,26 +5116,18 @@ top: /* * Tell ARC this no longer exists in L2ARC. 
*/ - if (hdr->b_l2hdr != NULL) { - abl2 = hdr->b_l2hdr; - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); - bytes_evicted += abl2->b_asize; - hdr->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - } + /* Tell ARC this no longer exists in L2ARC. */ + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; list_remove(buflist, hdr); - /* - * This may have been leftover after a - * failed write. - */ + /* This may have been leftover after a failed write. */ hdr->b_flags &= ~ARC_FLAG_L2_WRITING; } mutex_exit(hash_lock); } - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); dev->l2ad_evict = taddr; @@ -4920,8 +5169,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, pio = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; - head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; + head->b_flags |= ARC_FLAG_HAS_L2HDR; /* * We will want to try to compress buffers that are at least 2x the @@ -4932,7 +5182,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Copy buffers for L2ARC writing. */ - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); for (try = 0; try <= 3; try++) { uint64_t passed_sz = 0; @@ -4954,7 +5204,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { - l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; uint64_t buf_sz; @@ -4997,7 +5246,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * l2arc_write_done() can find where the * write buffers begin without searching. */ - list_insert_head(dev->l2ad_buflist, head); + list_insert_head(&dev->l2ad_buflist, head); cb = kmem_alloc(sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -5010,37 +5259,33 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Create and add a new L2ARC header. */ - l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), - KM_PUSHPAGE); - l2hdr->b_dev = dev; - arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); - + hdr->b_l2hdr.b_dev = dev; + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); hdr->b_flags |= ARC_FLAG_L2_WRITING; - /* * Temporarily stash the data buffer in b_tmp_cdata. * The subsequent write step will pick it up from - * there. This is because can't access hdr->b_buf + * there. This is because can't access b_l1hdr.b_buf * without holding the hash_lock, which we in turn * can't access without holding the ARC list locks * (which we want to avoid during compression/writing) */ - l2hdr->b_compress = ZIO_COMPRESS_OFF; - l2hdr->b_asize = hdr->b_size; - l2hdr->b_tmp_cdata = hdr->b_buf->b_data; - l2hdr->b_hits = 0; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + hdr->b_l2hdr.b_asize = hdr->b_size; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; buf_sz = hdr->b_size; - hdr->b_l2hdr = l2hdr; + hdr->b_flags |= ARC_FLAG_HAS_L2HDR; - list_insert_head(dev->l2ad_buflist, hdr); + list_insert_head(&dev->l2ad_buflist, hdr); /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. 
*/ - arc_cksum_verify(hdr->b_buf); - arc_cksum_compute(hdr->b_buf, B_TRUE); + arc_cksum_verify(hdr->b_l1hdr.b_buf); + arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); mutex_exit(hash_lock); @@ -5056,8 +5301,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); - mutex_exit(&l2arc_buflist_mtx); - kmem_cache_free(hdr_cache, head); + mutex_exit(&dev->l2ad_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); return (0); } @@ -5066,24 +5312,22 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * and work backwards, retracing the course of the buffer selector * loop above. */ - for (hdr = list_prev(dev->l2ad_buflist, head); hdr; - hdr = list_prev(dev->l2ad_buflist, hdr)) { - l2arc_buf_hdr_t *l2hdr; + for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; + hdr = list_prev(&dev->l2ad_buflist, hdr)) { uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged * it as ARC_FLAG_L2_WRITING in the previous step, but we must * take care to only access its L2 cache parameters. In - * particular, hdr->b_buf may be invalid by now due to + * particular, hdr->l1hdr.b_buf may be invalid by now due to * ARC eviction. */ - l2hdr = hdr->b_l2hdr; - l2hdr->b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - if (!l2arc_nocompress && (hdr->b_flags & ARC_FLAG_L2COMPRESS) && - l2hdr->b_asize >= buf_compress_minsz) { - if (l2arc_compress_buf(l2hdr)) { + if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) && + hdr->b_l2hdr.b_asize >= buf_compress_minsz) { + if (l2arc_compress_buf(hdr)) { /* * If compression succeeded, enable headroom * boost on the next scan cycle. @@ -5096,8 +5340,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * Pick up the buffer data we had previously stashed away * (and now potentially also compressed). */ - buf_data = l2hdr->b_tmp_cdata; - buf_sz = l2hdr->b_asize; + buf_data = hdr->b_l1hdr.b_tmp_cdata; + buf_sz = hdr->b_l2hdr.b_asize; /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { @@ -5122,7 +5366,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, } } - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); @@ -5150,7 +5394,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its + * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its * size in l2hdr->b_asize. This routine tries to compress the data and * depending on the compression result there are three possible outcomes: * *) The buffer was incompressible. The original l2hdr contents were left @@ -5168,17 +5412,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * buffer was incompressible). 
*/ static boolean_t -l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) +l2arc_compress_buf(arc_buf_hdr_t *hdr) { void *cdata; size_t csize, len, rounded; + l2arc_buf_hdr_t *l2hdr; - ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); - ASSERT(l2hdr->b_tmp_cdata != NULL); + ASSERT(HDR_HAS_L2HDR(hdr)); + + l2hdr = &hdr->b_l2hdr; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); + ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); len = l2hdr->b_asize; cdata = zio_data_buf_alloc(len); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, + ASSERT3P(cdata, !=, NULL); + csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, cdata, l2hdr->b_asize); rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); @@ -5190,9 +5441,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) if (csize == 0) { /* zero block, indicate that there's nothing to write */ zio_data_buf_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); l2hdr->b_asize = 0; - l2hdr->b_tmp_cdata = NULL; + hdr->b_l1hdr.b_tmp_cdata = NULL; ARCSTAT_BUMP(arcstat_l2_compress_zeros); return (B_TRUE); } else if (csize > 0 && csize < len) { @@ -5200,9 +5451,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) * Compression succeeded, we'll keep the cdata around for * writing and release it afterwards. */ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); l2hdr->b_asize = csize; - l2hdr->b_tmp_cdata = cdata; + hdr->b_l1hdr.b_tmp_cdata = cdata; ARCSTAT_BUMP(arcstat_l2_compress_successes); return (B_TRUE); } else { @@ -5250,9 +5501,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * need to fill its io_data after we're done restoring the * buffer's contents. */ - ASSERT(hdr->b_buf != NULL); - bzero(hdr->b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; + ASSERT(hdr->b_l1hdr.b_buf != NULL); + bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); + zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; } else { ASSERT(zio->io_data != NULL); /* @@ -5287,17 +5538,17 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) static void l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. */ - ASSERT(l2hdr->b_tmp_cdata != NULL); - zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size); + ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); + zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, + hdr->b_size); } - l2hdr->b_tmp_cdata = NULL; + hdr->b_l1hdr.b_tmp_cdata = NULL; } /* @@ -5442,13 +5693,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_writing = B_FALSE; list_link_init(&adddev->l2ad_node); + mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ - adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); - list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l2node)); + list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); @@ -5494,8 +5745,8 @@ l2arc_remove_vdev(vdev_t *vd) * Clear all buflists and ARC references. L2ARC device flush. 
*/ l2arc_evict(remdev, 0, B_TRUE); - list_destroy(remdev->l2ad_buflist); - kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + list_destroy(&remdev->l2ad_buflist); + mutex_destroy(&remdev->l2ad_mtx); kmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -5510,7 +5761,6 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; @@ -5535,7 +5785,6 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); - mutex_destroy(&l2arc_buflist_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list);