diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 349c208c52..04003bbdf2 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -3087,6 +3087,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 			(void) printf(gettext("invalid label"));
 			break;
 
+		case VDEV_AUX_POOL_USES_SHARED_L2ARC:
+			(void) printf(gettext("ignored in favor of cache devices in pool " ZFS_SHARED_L2ARC_POOL_NAME));
+			break;
+
 		default:
 			(void) printf(gettext("corrupted data"));
 			break;
@@ -3247,6 +3251,10 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv,
 			(void) printf(gettext("invalid label"));
 			break;
 
+		case VDEV_AUX_POOL_USES_SHARED_L2ARC:
+			(void) printf(gettext("ignored in favor of cache devices in pool " ZFS_SHARED_L2ARC_POOL_NAME));
+			break;
+
 		default:
 			(void) printf(gettext("corrupted data"));
 			break;
diff --git a/include/sys/arc.h b/include/sys/arc.h
index c92b3eee61..9b9d5124c2 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -337,6 +337,7 @@ void arc_fini(void);
 /*
  * Level 2 ARC
  */
+#define	ZFS_SHARED_L2ARC_POOL_NAME	"NTNX-fsvm-local-l2arc"
 void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
 void l2arc_remove_vdev(vdev_t *vd);
 boolean_t l2arc_vdev_present(vdev_t *vd);
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index fc4f22cd53..5da4009314 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -997,6 +997,7 @@ typedef enum vdev_state {
 	VDEV_STATE_REMOVED,	/* Explicitly removed from system	*/
 	VDEV_STATE_CANT_OPEN,	/* Tried to open, but failed		*/
 	VDEV_STATE_FAULTED,	/* External request to fault device	*/
+	VDEV_STATE_UNUSED,
 	VDEV_STATE_DEGRADED,	/* Replicated vdev with unhealthy kids	*/
 	VDEV_STATE_HEALTHY	/* Presumed good			*/
 } vdev_state_t;
@@ -1026,6 +1027,7 @@ typedef enum vdev_aux {
 	VDEV_AUX_SPLIT_POOL,	/* vdev was split off into another pool	*/
 	VDEV_AUX_BAD_ASHIFT,	/* vdev ashift is invalid		*/
 	VDEV_AUX_EXTERNAL_PERSIST,	/* persistent forced fault	*/
+	VDEV_AUX_POOL_USES_SHARED_L2ARC,
 	VDEV_AUX_ACTIVE,	/* vdev active on a different host	*/
 	VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline		*/
 	VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 14410b1531..b146ee27ac 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -208,6 +208,8 @@ zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
 		return (gettext("DEGRADED"));
 	case VDEV_STATE_HEALTHY:
 		return (gettext("ONLINE"));
+	case VDEV_STATE_UNUSED:
+		return (gettext("UNUSED"));
 
 	default:
 		break;
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 714a30e863..b51ca678b9 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -823,6 +823,8 @@ typedef struct l2arc_read_callback {
 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
 	abd_t			*l2rcb_abd;		/* temporary buffer */
+	spa_t			*l2rcb_primary_spa;	/* original spa */
+	boolean_t		l2rcb_need_exit_scl_l2arc_primary_spa;
 } l2arc_read_callback_t;
 
 typedef struct l2arc_data_free {
@@ -880,7 +882,7 @@ static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
-static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static boolean_t l2arc_write_eligible(arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 static void l2arc_do_free_on_write(void);
 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
@@ -3790,7 +3792,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
 	if (HDR_HAS_L2HDR(hdr)) {
 		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
 	} else {
-		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+		if (l2arc_write_eligible(hdr)) {
 			ARCSTAT_INCR(arcstat_evict_l2_eligible,
 			    HDR_GET_LSIZE(hdr));
@@ -5530,6 +5532,24 @@ arc_cached(spa_t *spa, const blkptr_t *bp)
 	return (flags);
 }
 
+static boolean_t
+arc_read_l2arc_scl_l2arc_tryenter(vdev_t *l2arc_vdev)
+{
+	ASSERT(l2arc_vdev);
+
+	/* we only use l2arc vdevs from the dedicated L2ARC pool */
+	ASSERT(l2arc_vdev->vdev_spa);
+	ASSERT(strcmp(spa_name(l2arc_vdev->vdev_spa), ZFS_SHARED_L2ARC_POOL_NAME) == 0);
+
+	return (spa_config_tryenter(l2arc_vdev->vdev_spa, SCL_L2ARC, l2arc_vdev, RW_READER));
+}
+
+static void
+arc_read_l2arc_scl_l2arc_exit(vdev_t *l2arc_vdev)
+{
+	spa_config_exit(l2arc_vdev->vdev_spa, SCL_L2ARC, l2arc_vdev);
+}
+
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
@@ -5927,7 +5947,7 @@ top:
 		 * Lock out L2ARC device removal.
 		 */
 		if (vdev_is_dead(vd) ||
-		    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+		    !arc_read_l2arc_scl_l2arc_tryenter(vd))
 			vd = NULL;
 	}
 
@@ -5965,8 +5985,13 @@ top:
 		}
 
 		/* Check if the spa even has l2 configured */
-		const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
-		    spa->spa_l2cache.sav_count > 0;
+		/* XXX shared l2arc: if this pool is supposed to use the
+		 * shared l2arc vdevs, the value should be the result of a
+		 * check whether shared l2arc vdevs are available.
+		 * On spas that aren't supposed to use shared l2arc, bring
+		 * back the original check from OpenZFS commit 666aa69f.
+		 */
+		const boolean_t spa_has_l2 = B_TRUE;
 
 		if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
@@ -5983,16 +6008,15 @@ top:
 			abd_t *abd;
 			uint64_t asize;
 
-			DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
-			ARCSTAT_BUMP(arcstat_l2_hits);
-			hdr->b_l2hdr.b_hits++;
-
 			cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 			    KM_SLEEP);
 			cb->l2rcb_hdr = hdr;
 			cb->l2rcb_bp = *bp;
 			cb->l2rcb_zb = *zb;
 			cb->l2rcb_flags = zio_flags;
+			cb->l2rcb_primary_spa = spa;
+			cb->l2rcb_need_exit_scl_l2arc_primary_spa =
+			    B_FALSE;
 
 			/*
 			 * When Compressed ARC is disabled, but the
@@ -6018,6 +6042,38 @@ top:
 			    addr + asize <= vd->vdev_psize -
 			    VDEV_LABEL_END_SIZE);
 
+			/*
+			 * Make sure the primary spa is still around
+			 * for the fallback read that will be initiated
+			 * by l2arc_read_done if the read from the l2arc
+			 * vdev fails and it's ARC_FLAG_NOWAIT.
+			 * l2arc_read_done is responsible for releasing.
+			 * We use `cb` as the tag because unlike `vd`
+			 * or `rzio`, we control its lifetime.
+			 */
+			if (spa != vd->vdev_spa) {
+				/*
+				 * See the comment in l2arc_read_done on
+				 * how we are careful to avoid recursive
+				 * calls to this function, which would
+				 * result in an error.
+				 */
+				int ok = spa_config_tryenter(spa,
+				    SCL_L2ARC, cb, RW_READER);
+				if (!ok) {
+					arc_read_l2arc_scl_l2arc_exit(vd);
+					/* XXX separate stat & probe?? */
+					DTRACE_PROBE1(l2arc__miss,
+					    arc_buf_hdr_t *, hdr);
+					ARCSTAT_BUMP(arcstat_l2_misses);
+					goto l2arc_read_error;
+				}
+			}
+
+			DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+			ARCSTAT_BUMP(arcstat_l2_hits);
+			hdr->b_l2hdr.b_hits++;
+
 			/*
 			 * l2arc read.  The SCL_L2ARC lock will be
 			 * released by l2arc_read_done().
@@ -6026,6 +6082,12 @@ top:
 			 */
 			ASSERT3U(arc_hdr_get_compress(hdr), !=,
			    ZIO_COMPRESS_EMPTY);
+			/*
+			 * NB: if pio != NULL and spa != vd->vdev_spa,
+			 * then this zio tree spans across multiple spas.
+			 * A manual code audit at the time of writing
+			 * showed that the zio code is fine with that.
+			 */
 			rzio = zio_read_phys(pio, vd, addr, asize, abd,
 			    ZIO_CHECKSUM_OFF,
@@ -6052,6 +6114,7 @@ top:
 			if (zio_wait(rzio) == 0)
 				goto out;
 
+l2arc_read_error:
 			/* l2arc read error; goto zio_read() */
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
@@ -6061,11 +6124,11 @@ top:
 			ARCSTAT_BUMP(arcstat_l2_misses);
 			if (HDR_L2_WRITING(hdr))
 				ARCSTAT_BUMP(arcstat_l2_rw_clash);
-			spa_config_exit(spa, SCL_L2ARC, vd);
+			arc_read_l2arc_scl_l2arc_exit(vd);
 		}
 	} else {
 		if (vd != NULL)
-			spa_config_exit(spa, SCL_L2ARC, vd);
+			arc_read_l2arc_scl_l2arc_exit(vd);
 
 		/*
 		 * Only a spa with l2 should contribute to l2
@@ -8110,7 +8173,7 @@ arc_fini(void)
  */
 static boolean_t
-l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
+l2arc_write_eligible(arc_buf_hdr_t *hdr)
 {
 	/*
 	 * A buffer is *not* eligible for the L2ARC if it:
@@ -8119,7 +8182,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 	 * 3. has an I/O in progress (it may be an incomplete read).
 	 * 4. is flagged not eligible (zfs property).
 	 */
-	if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
+	if (HDR_HAS_L2HDR(hdr) ||
 	    HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
 		return (B_FALSE);
@@ -8547,9 +8610,31 @@ error:
 }
 
+static void
+l2arc_fallback_read_done(zio_t *zio)
+{
+	l2arc_read_callback_t *cb = zio->io_private;
+	ASSERT3P(zio->io_spa, ==, cb->l2rcb_primary_spa);
+	if (cb->l2rcb_need_exit_scl_l2arc_primary_spa) {
+		/*
+		 * Release SCL_L2ARC before arc_read_done to avoid a
+		 * recursive spa_config_tryenter if arc_read_done calls
+		 * arc_read again.
+		 */
+		spa_config_exit(cb->l2rcb_primary_spa, SCL_L2ARC, cb);
+	}
+	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
+	zio->io_private = hdr;
+	kmem_free(cb, sizeof (l2arc_read_callback_t));
+	arc_read_done(zio);
+}
+
 /*
  * A read to a cache device completed.  Validate buffer contents before
  * handing over to the regular ARC routines.
+ *
+ * We take care of cleaning up any L2ARC read resources before calling
+ * arc_read_done to minimize overhead if the arc_read callbacks issue
+ * arc_read themselves.  Ideally, the compiler turns the arc_read_done
+ * call into a jump.
  */
 static void
 l2arc_read_done(zio_t *zio)
@@ -8565,11 +8650,39 @@ l2arc_read_done(zio_t *zio)
 	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
-	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+	ASSERT(strcmp(spa_name(zio->io_spa), ZFS_SHARED_L2ARC_POOL_NAME) == 0);
 
 	ASSERT3P(cb, !=, NULL);
 	hdr = cb->l2rcb_hdr;
 	ASSERT3P(hdr, !=, NULL);
+	ASSERT(cb->l2rcb_primary_spa);
+
+	boolean_t primary_spa_is_not_l2arc_spa =
+	    zio->io_spa != cb->l2rcb_primary_spa;
+	/* Unblock L2ARC device removal. */
+	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+	if (primary_spa_is_not_l2arc_spa) {
+		/* We must keep holding SCL_L2ARC for l2rcb_primary_spa until
+		 * after we have issued the fallback read, in order to prevent
+		 * l2rcb_primary_spa from being exported (or worse, freed).
+		 *
+		 * However, we must release it before we call the `done`
+		 * callback that was passed to the original arc_read.  The
+		 * reason is that the callback could issue another arc_read,
+		 * resulting in a recursive call to spa_config_tryenter(primary
+		 * spa, SCL_L2ARC).  That call would fail if we still held
+		 * SCL_L2ARC, and the code would treat it like an L2ARC miss,
+		 * whereas it's most likely a hit.
+		 */
+		/*
+		 * XXX: we should actually use refcount_held to assert that
+		 * `cb` still holds the config lock.
+		 */
+		ASSERT(spa_config_held(cb->l2rcb_primary_spa, SCL_L2ARC,
+		    RW_READER));
+	} else {
+		/* XXX: assert that `cb` is not holding SCL_L2ARC */
+	}
+
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
@@ -8622,10 +8735,27 @@ l2arc_read_done(zio_t *zio)
 	 */
 	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
 	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
+	/*
+	 * Set the zio's bp to the one we're trying to fill from L2ARC.
+	 * The bp contains the checksum, and both arc_cksum_is_equal and
+	 * the fallback zio_read depend on it being set correctly.
+	 */
 	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 
 	zio->io_prop.zp_complevel = hdr->b_complevel;
 
+#ifdef _KERNEL
+	if (0) {
+		zio_t *tmp = zio_unique_parent(zio);
+		cmn_err(CE_WARN, "zio->io_waiter=%p zio->io_spa=%s "
+		    "zio_unique_parent(zio)=%p parent_is_godfather=%d "
+		    "zio_unique_parent(zio)->io_spa=%s ",
+		    zio->io_waiter, spa_name(zio->io_spa),
+		    tmp, tmp ? !!(tmp->io_flags & ZIO_FLAG_GODFATHER) : -1,
+		    tmp ? spa_name(tmp->io_spa) : "");
+	}
+#endif
+
 	valid_cksum = arc_cksum_is_equal(hdr, zio);
 
@@ -8639,6 +8769,10 @@ l2arc_read_done(zio_t *zio)
 	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
 	    !HDR_L2_EVICTED(hdr)) {
 		mutex_exit(hash_lock);
+		if (primary_spa_is_not_l2arc_spa) {
+			spa_config_exit(cb->l2rcb_primary_spa, SCL_L2ARC, cb);
+		}
+		kmem_free(cb, sizeof (l2arc_read_callback_t));
 		zio->io_private = hdr;
 		arc_read_done(zio);
 	} else {
@@ -8649,6 +8783,10 @@ l2arc_read_done(zio_t *zio)
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
+			/*
+			 * Setting io_error ensures that arc_read does the
+			 * fallback read where zio->io_waiter != NULL.
+			 */
 			zio->io_error = SET_ERROR(EIO);
 		}
 		if (!valid_cksum || tfm_error != 0)
@@ -8656,20 +8794,62 @@ l2arc_read_done(zio_t *zio)
 
 		/*
 		 * If there's no waiter, issue an async i/o to the primary
-		 * storage now.  If there *is* a waiter, the caller must
-		 * issue the i/o in a context where it's OK to block.
+		 * storage now.  If there *is* a waiter, the caller must issue
+		 * the i/o in a context where it's OK to block.
+		 *
+		 * The above in more technical terms: if the arc_read was
+		 * ARC_FLAG_NOWAIT, we issued the zio_read_phys to read from
+		 * the l2arc vdev using zio_nowait and immediately returned
+		 * from arc_read.  But arc_read is supposed to transparently
+		 * fall back to the primary storage.  So, for the
+		 * ARC_FLAG_NOWAIT case, we do that here.  However, if there
+		 * *is* a waiter, it's the ARC_FLAG_WAIT case, and arc_read
+		 * does the fallback read itself.
 		 */
 		if (zio->io_waiter == NULL) {
 			zio_t *pio = zio_unique_parent(zio);
 			void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd :
 			    hdr->b_l1hdr.b_pabd;
 
-			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+			/*
+			 * The `pio` is the one that was passed to arc_read.
+			 * We added `zio` as its (only) child.
+			 * But arc_read can also be called with pio == NULL.
+			 * In that case, zio_nowait sets zio's parent to
+			 * spa_async_zio_root (aka the godfather zio).
+			 */
+			if (pio) {
+				ASSERT3S(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+				/*
+				 * If we violated the following assertion, we
+				 * would be reading from the wrong spa in the
+				 * fallback zio_read below.  The caller of
+				 * arc_read would likely observe an EIO due to
+				 * an invalid checksum.  What happens then
+				 * depends on the caller.
+				 * If the arc_read was for ZFS metadata, it's
+				 * likely that a VERIFY somewhere higher up
+				 * the stack would result in a kernel panic.
+				 * If it's for user data, it's likely that we
+				 * bubble up the error to userspace.
+				 *
+				 * Either way, the resulting behavior is
+				 * misleading and would likely spark
+				 * misdirected investigations into data
+				 * integrity issues.  So, just take the hit
+				 * and do a VERIFY here to get a precise
+				 * panic in case we have an implementation
+				 * issue.
+				 */
+				VERIFY3P(pio->io_spa, ==, cb->l2rcb_primary_spa);
+			}
 
-			zio = zio_read(pio, zio->io_spa, zio->io_bp,
-			    abd, zio->io_size, arc_read_done,
-			    hdr, zio->io_priority, cb->l2rcb_flags,
-			    &cb->l2rcb_zb);
+			/* NB: io_bp has been copied from cb->l2rcb_bp above */
+			cb->l2rcb_need_exit_scl_l2arc_primary_spa =
+			    primary_spa_is_not_l2arc_spa;
+			zio_t *fallback_read = zio_read(pio,
+			    cb->l2rcb_primary_spa, zio->io_bp,
+			    abd, zio->io_size, l2arc_fallback_read_done, cb,
+			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb);
 
 			/*
 			 * Original ZIO will be freed, so we need to update
@@ -8681,13 +8861,22 @@ l2arc_read_done(zio_t *zio)
-				acb->acb_zio_head = zio;
+				acb->acb_zio_head = fallback_read;
 
 			mutex_exit(hash_lock);
-			zio_nowait(zio);
+			zio_nowait(fallback_read);
+			/*
+			 * We release the SCL_L2ARC on l2rcb_primary_spa in
+			 * l2arc_fallback_read_done.  See the comment there,
+			 * at the beginning of the function, for why we can't
+			 * release it here.
+			 */
 		} else {
 			mutex_exit(hash_lock);
+			if (primary_spa_is_not_l2arc_spa) {
+				spa_config_exit(cb->l2rcb_primary_spa,
+				    SCL_L2ARC, cb);
+			}
+			kmem_free(cb, sizeof (l2arc_read_callback_t));
+			/* arc_read will do the fallback read */
 		}
 	}
-
-	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
@@ -8923,7 +9112,7 @@ retry:
 		}
 
 		if (!HDR_HAS_L1HDR(hdr)) {
-			ASSERT(!HDR_L2_READING(hdr));
+			ASSERT(!HDR_L2_READING(hdr));	/* XXX why does this hold? */
 			/*
 			 * This doesn't exist in the ARC. Destroy.
 			 * arc_hdr_destroy() will call list_remove()
@@ -9129,7 +9318,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	boolean_t full, from_head = !arc_warm;
 	l2arc_write_callback_t *cb = NULL;
 	zio_t *pio, *wzio;
-	uint64_t guid = spa_load_guid(spa);
 	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
 
 	ASSERT3P(dev->l2ad_vdev, !=, NULL);
@@ -9199,7 +9387,7 @@ skip:
 			break;
 		}
 
-		if (!l2arc_write_eligible(guid, hdr)) {
+		if (!l2arc_write_eligible(hdr)) {
 			mutex_exit(hash_lock);
 			goto skip;
 		}
@@ -9452,6 +9640,12 @@ l2arc_feed_thread(void *unused)
 		spa = dev->l2ad_spa;
 		ASSERT3P(spa, !=, NULL);
 
+		/*
+		 * We only allow adding l2arc devices in pool
+		 * ZFS_SHARED_L2ARC_POOL_NAME to l2arc_dev_list.
+		 */
+		ASSERT0(strcmp(spa_name(spa), ZFS_SHARED_L2ARC_POOL_NAME));
+
 		/*
 		 * If the pool is read-only then force the feed thread to
 		 * sleep a little longer.
@@ -9611,6 +9805,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 
 	ASSERT(!l2arc_vdev_present(vd));
 
+	VERIFY(strcmp(spa_name(spa), ZFS_SHARED_L2ARC_POOL_NAME) == 0);
+
 	/*
 	 * Create a new l2arc device entry.
 	 */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 099883ba26..933558a138 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -3482,7 +3482,28 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
 		} else {
 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
 		}
-		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+		/* XXX: with shared l2arc, this ASSERT is incorrect.
+		 * After a conversation with Paul Dagnelie, who added it,
+		 * we concluded that it's not necessary.
+		 * The stack trace was:
+		 * VERIFY3(zio->io_spa == dpa->dpa_spa) failed
+		 * (ffff8fc81ba6c000 == ffff8fc9c4aa8000)
+		 *  ^ l2arc spa         ^ primary spa
+		 *
+		 * dump_stack+0x6d/0x8b
+		 * spl_dumpstack+0x29/0x2b [spl]
+		 * spl_panic+0xd1/0xd3 [spl]
+		 * dbuf_prefetch_indirect_done+0xdd/0x420 [zfs]
+		 * arc_read_done+0x3e6/0x990 [zfs]
+		 * l2arc_read_done+0x5a9/0x9a0 [zfs]
+		 * zio_done+0x54d/0x1680 [zfs]
+		 * zio_execute+0xe9/0x2e0 [zfs]
+		 * taskq_thread+0x2ec/0x650 [spl]
+		 * kthread+0x128/0x140
+		 * ret_from_fork+0x35/0x40
+		 *
+		 * ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+		 */
 		dpa->dpa_dnode = NULL;
 	} else if (dpa->dpa_dnode != NULL) {
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 1a68a09535..acf5ac127f 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -2413,6 +2413,20 @@ spa_load_l2cache(spa_t *spa)
 
 		(void) vdev_validate_aux(vd);
 
+		if (!vdev_is_dead(vd) &&
+		    strcmp(spa_name(spa), ZFS_SHARED_L2ARC_POOL_NAME) != 0) {
+			cmn_err(CE_WARN, "pool '%s': l2arc devices outside of pool " ZFS_SHARED_L2ARC_POOL_NAME " will not be used and will be marked UNUSED.", spa_name(spa));
+			/* mark the device as dead */
+			vdev_set_state(vd, B_TRUE, VDEV_STATE_UNUSED,
+			    VDEV_AUX_POOL_USES_SHARED_L2ARC);
+			/*
+			 * Ensure that vdev_is_dead() returns true, so we
+			 * don't call l2arc_add_vdev() below.  The whole
+			 * shared L2ARC code relies on this.
+			 */
+			VERIFY(vdev_is_dead(vd));
+		}
+
 		if (!vdev_is_dead(vd))
 			l2arc_add_vdev(spa, vd);
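Note for reviewers on the placement of VDEV_STATE_UNUSED: the spa.c hunk
VERIFYs that vdev_set_state(..., VDEV_STATE_UNUSED, ...) leaves the vdev
"dead".  That holds because vdev_is_dead() treats every state below
VDEV_STATE_DEGRADED as dead, which is why the new value is inserted before
VDEV_STATE_DEGRADED rather than appended at the end.  For reference,
vdev_is_dead() in OpenZFS is roughly:

	boolean_t
	vdev_is_dead(vdev_t *vd)
	{
		/*
		 * Holes and missing devices are always dead; otherwise,
		 * anything below VDEV_STATE_DEGRADED is dead.  Inserting
		 * VDEV_STATE_UNUSED before VDEV_STATE_DEGRADED therefore
		 * makes UNUSED vdevs dead, which spa_load_l2cache relies on.
		 */
		return (vd->vdev_state < VDEV_STATE_DEGRADED ||
		    vd->vdev_ops == &vdev_hole_ops ||
		    vd->vdev_ops == &vdev_missing_ops);
	}

The flip side of inserting mid-enum is that VDEV_STATE_DEGRADED and
VDEV_STATE_HEALTHY are renumbered, so the kernel module and userland tools
must be built from matching headers.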
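To see the lock choreography end to end, here is a self-contained userspace
sketch of the SCL_L2ARC handoff using POSIX rwlocks.  It is illustrative
only and not part of the patch; l2arc_read_begin, l2arc_read_failed,
read_cb_t, and the done callback are invented stand-ins for arc_read,
l2arc_read_done, l2arc_read_callback_t, and the arc_read done callback.

	/* cc -pthread model.c -o model */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	typedef struct spa {
		const char *name;
		pthread_rwlock_t scl_l2arc;	/* stands in for SCL_L2ARC */
	} spa_t;

	typedef struct read_cb {
		spa_t *primary_spa;	/* mirrors l2rcb_primary_spa */
		bool need_exit_primary;	/* mirrors l2rcb_need_exit_... */
		void (*done)(struct read_cb *);
	} read_cb_t;

	/* "arc_read": lock the l2arc spa, then pin the primary spa. */
	static bool
	l2arc_read_begin(spa_t *l2arc_spa, read_cb_t *cb)
	{
		if (pthread_rwlock_tryrdlock(&l2arc_spa->scl_l2arc) != 0)
			return (false);		/* treated as an L2ARC miss */
		if (cb->primary_spa != l2arc_spa &&
		    pthread_rwlock_tryrdlock(&cb->primary_spa->scl_l2arc) != 0) {
			pthread_rwlock_unlock(&l2arc_spa->scl_l2arc);
			return (false);		/* also treated as a miss */
		}
		return (true);
	}

	/* "l2arc_read_done": unblock removal, keep the primary spa pinned. */
	static void
	l2arc_read_failed(spa_t *l2arc_spa, read_cb_t *cb)
	{
		pthread_rwlock_unlock(&l2arc_spa->scl_l2arc);
		cb->need_exit_primary = (cb->primary_spa != l2arc_spa);
		/* ... the fallback read against cb->primary_spa goes here ... */
	}

	/* "l2arc_fallback_read_done": drop the pin before the callback. */
	static void
	fallback_read_done(read_cb_t *cb)
	{
		if (cb->need_exit_primary)
			pthread_rwlock_unlock(&cb->primary_spa->scl_l2arc);
		cb->done(cb);	/* free to call l2arc_read_begin again */
	}

	static void
	done(read_cb_t *cb)
	{
		(void) printf("fallback done on %s\n", cb->primary_spa->name);
	}

	int
	main(void)
	{
		spa_t l2arc = { "NTNX-fsvm-local-l2arc",
		    PTHREAD_RWLOCK_INITIALIZER };
		spa_t primary = { "tank", PTHREAD_RWLOCK_INITIALIZER };
		read_cb_t cb = { &primary, false, done };

		if (l2arc_read_begin(&l2arc, &cb)) {
			/* simulate an I/O error on the l2arc vdev */
			l2arc_read_failed(&l2arc, &cb);
			fallback_read_done(&cb);
		}
		return (0);
	}

The ordering to take away: the l2arc spa's lock is released as soon as the
cache read completes, the primary spa stays pinned until the fallback read
has been issued, and the pin is dropped before the caller's done callback
runs so that a nested read can take the lock again.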