diff --git a/include/sys/arc.h b/include/sys/arc.h
index c92b3eee61..5839f87087 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -63,8 +63,15 @@ extern "C" {
 	(hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
 } while (0)
 
+/* The asize in the header is only used by L2 cache */
+#define HDR_SET_ASIZE(hdr, x) do { \
+	ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
+	(hdr)->b_asize = ((x) >> SPA_MINBLOCKSHIFT); \
+} while (0)
+
 #define HDR_GET_LSIZE(hdr)	((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
 #define HDR_GET_PSIZE(hdr)	((hdr)->b_psize << SPA_MINBLOCKSHIFT)
+#define HDR_GET_ASIZE(hdr)	((hdr)->b_asize << SPA_MINBLOCKSHIFT)
 
 typedef struct arc_buf_hdr arc_buf_hdr_t;
 typedef struct arc_buf arc_buf_t;
@@ -323,8 +330,10 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
 int arc_cached(spa_t *spa, const blkptr_t *bp);
 
 void arc_flush(spa_t *spa, boolean_t retry);
+void arc_flush_async(spa_t *spa);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
+boolean_t arc_async_flush_guid_inuse(uint64_t load_guid);
 
 uint64_t arc_all_memory(void);
 uint64_t arc_default_max(uint64_t min, uint64_t allmem);
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 01693d72dd..4ecd7036db 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -378,8 +378,8 @@ typedef struct l2arc_lb_ptr_buf {
  * L2ARC Internals
  */
 typedef struct l2arc_dev {
-	vdev_t		*l2ad_vdev;	/* vdev */
-	spa_t		*l2ad_spa;	/* spa */
+	vdev_t		*l2ad_vdev;	/* can be NULL during remove */
+	spa_t		*l2ad_spa;	/* can be NULL during remove */
 	uint64_t	l2ad_hand;	/* next write location */
 	uint64_t	l2ad_start;	/* first addr on device */
 	uint64_t	l2ad_end;	/* last addr on device */
@@ -475,8 +475,8 @@ struct arc_buf_hdr {
 	arc_buf_contents_t	b_type;
 	uint8_t			b_complevel;
-	uint8_t			b_reserved1;	/* used for 4 byte alignment */
-	uint16_t		b_reserved2;	/* used for 4 byte alignment */
+	uint8_t			b_reserved1;	/* used for 4 byte alignment */
+	uint16_t		b_asize;	/* alignment or L2-only asize */
 
 	arc_buf_hdr_t		*b_hash_next;
 	arc_flags_t		b_flags;
diff --git a/include/sys/spa.h b/include/sys/spa.h
index aa66d489ef..97d08ae5fa 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1090,6 +1090,7 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_generate_guid(spa_t *spa);
+extern uint64_t spa_generate_load_guid(void);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 714a30e863..596cb6c2f9 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -26,7 +26,7 @@
  * Copyright (c) 2017, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
- * Copyright (c) 2019, 2023, Klara Inc.
+ * Copyright (c) 2019, 2024, Klara Inc.
 * Copyright (c) 2019, Allan Jude
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
@@ -464,6 +464,9 @@ static uint_t zfs_arc_lotsfree_percent = 10;
  */
 static int zfs_arc_prune_task_threads = 1;
 
+/* Used by spa_export/spa_destroy to flush the arc asynchronously */
+static taskq_t *arc_flush_taskq;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -772,6 +775,23 @@ static buf_hash_table_t buf_hash_table;
 
 uint64_t zfs_crc64_table[256];
 
+/*
+ * Asynchronous ARC flush
+ *
+ * We track these in a list for arc_async_flush_guid_inuse()
+ */
+static list_t arc_async_flush_list;
+static kmutex_t arc_async_flush_lock;
+
+typedef struct arc_async_flush {
+	uint64_t	af_spa_guid;
+	taskqid_t	af_task_id;
+	list_node_t	af_node;
+} arc_async_flush_t;
+
+static unsigned int arc_async_flush_init_spa_list(uint64_t spa_list[],
+    unsigned int list_len);
+
 /*
  * Level 2 ARC
  */
@@ -1718,6 +1738,8 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
 	arc_buf_hdr_t *hdr;
 
 	ASSERT(size != 0);
+	ASSERT(dev->l2ad_vdev != NULL);
+
 	hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
 	hdr->b_birth = birth;
 	hdr->b_type = type;
@@ -1725,6 +1747,7 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
 	HDR_SET_LSIZE(hdr, size);
 	HDR_SET_PSIZE(hdr, psize);
+	HDR_SET_ASIZE(hdr, vdev_psize_to_asize(dev->l2ad_vdev, psize));
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = complevel;
 	if (protected)
@@ -3508,16 +3531,17 @@ static void
 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only)
 {
-	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
-	l2arc_dev_t *dev = l2hdr->b_dev;
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
-	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+	uint64_t asize = HDR_GET_ASIZE(hdr);
 	arc_buf_contents_t type = hdr->b_type;
 	int64_t lsize_s;
 	int64_t psize_s;
 	int64_t asize_s;
 
+	/* For L2 we expect the header's b_asize to be valid */
+	ASSERT3U(asize, >=, psize);
+
 	if (incr) {
 		lsize_s = lsize;
 		psize_s = psize;
@@ -3579,8 +3603,6 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
 	l2arc_dev_t *dev = l2hdr->b_dev;
-	uint64_t psize = HDR_GET_PSIZE(hdr);
-	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 
 	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
 	ASSERT(HDR_HAS_L2HDR(hdr));
@@ -3588,7 +3610,10 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
 
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	l2arc_hdr_arcstats_decrement(hdr);
-	vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+	if (dev->l2ad_vdev != NULL) {
+		uint64_t asize = HDR_GET_ASIZE(hdr);
+		vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+	}
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
 	    hdr);
@@ -3854,9 +3879,20 @@ arc_set_need_free(void)
 	}
 }
 
+static boolean_t
+arc_spa_is_list_member(uint64_t spa_guid, uint64_t spa_list[],
+    unsigned int spa_cnt)
+{
+	for (int i = 0; i < spa_cnt; i++) {
+		if (spa_list[i] == spa_guid)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
-    uint64_t spa, uint64_t bytes)
+    uint64_t bytes, uint64_t spa_list[], unsigned int spa_cnt)
 {
 	multilist_sublist_t *mls;
 	uint64_t bytes_evicted = 0, real_evicted = 0;
@@ -3898,8 +3934,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 		if (hdr->b_spa == 0)
 			continue;
 
-		/* we're only interested in evicting buffers of a certain spa */
-		if (spa != 0 && hdr->b_spa != spa) {
+		/*
+		 * Check if we're only interested in evicting buffers from
+		 * a specific list of spas. This would typically be from
+		 * spas that are being unloaded.
+		 */
+		if (spa_cnt > 0 &&
+		    !arc_spa_is_list_member(hdr->b_spa, spa_list, spa_cnt)) {
 			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
@@ -4035,8 +4076,8 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
  * the given arc state; which is used by arc_flush().
  */
 static uint64_t
-arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
-    uint64_t bytes)
+arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t bytes,
+    uint64_t spa_list[], unsigned int spa_cnt)
 {
 	uint64_t total_evicted = 0;
 	multilist_t *ml = &state->arcs_list[type];
@@ -4091,7 +4132,8 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 			break;
 
 		bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
-		    markers[sublist_idx], spa, bytes_remaining);
+		    markers[sublist_idx], bytes_remaining, spa_list,
+		    spa_cnt);
 
 		scan_evicted += bytes_evicted;
 		total_evicted += bytes_evicted;
@@ -4156,9 +4198,11 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
     boolean_t retry)
 {
 	uint64_t evicted = 0;
+	uint64_t spa_list[1] = {spa};
 
 	while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
-		evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL);
+		evicted += arc_evict_state(state, type, ARC_EVICT_ALL,
+		    spa_list, spa == 0 ? 0 : 1);
 
 		if (!retry)
 			break;
@@ -4182,7 +4226,15 @@ arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes)
 	if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
 		delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
 		    bytes);
-		return (arc_evict_state(state, type, 0, delta));
+		/*
+		 * Create a list of guids from any active ARC async flushes.
+		 * The underlying arc_evict_state() function will target
+		 * only spa guids from this list when it is not empty.
+		 */
+		uint64_t spa_list[16];
+		unsigned int spa_cnt =
+		    arc_async_flush_init_spa_list(spa_list, 16);
+		return (arc_evict_state(state, type, delta, spa_list, spa_cnt));
 	}
 
 	return (0);
@@ -4375,20 +4427,10 @@ arc_evict(void)
 	return (total_evicted);
 }
 
-void
-arc_flush(spa_t *spa, boolean_t retry)
+static void
+arc_flush_impl(uint64_t guid, boolean_t retry)
 {
-	uint64_t guid = 0;
-
-	/*
-	 * If retry is B_TRUE, a spa must not be specified since we have
-	 * no good way to determine if all of a spa's buffers have been
-	 * evicted from an arc state.
-	 */
-	ASSERT(!retry || spa == NULL);
-
-	if (spa != NULL)
-		guid = spa_load_guid(spa);
+	ASSERT(!retry || guid == 0);
 
 	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
 	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
@@ -4406,6 +4448,143 @@ arc_flush(spa_t *spa, boolean_t retry)
 	(void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry);
 }
 
+void
+arc_flush(spa_t *spa, boolean_t retry)
+{
+	/*
+	 * If retry is B_TRUE, a spa must not be specified since we have
+	 * no good way to determine if all of a spa's buffers have been
+	 * evicted from an arc state.
+	 */
+	ASSERT(!retry || spa == NULL);
+
+	arc_flush_impl(spa != NULL ? spa_load_guid(spa) : 0, retry);
+}
+
+static arc_async_flush_t *
+arc_async_flush_add(uint64_t spa_guid, taskqid_t task_id)
+{
+	arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP);
+	af->af_spa_guid = spa_guid;
+	af->af_task_id = task_id;
+	list_link_init(&af->af_node);
+
+	mutex_enter(&arc_async_flush_lock);
+	list_insert_tail(&arc_async_flush_list, af);
+	mutex_exit(&arc_async_flush_lock);
+
+	return (af);
+}
+
+static void
+arc_async_flush_remove(uint64_t spa_guid, taskqid_t task_id)
+{
+	mutex_enter(&arc_async_flush_lock);
+	for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
+	    af != NULL; af = list_next(&arc_async_flush_list, af)) {
+		if (af->af_spa_guid == spa_guid && af->af_task_id == task_id) {
+			list_remove(&arc_async_flush_list, af);
+			kmem_free(af, sizeof (*af));
+			break;
+		}
+	}
+	mutex_exit(&arc_async_flush_lock);
+}
+
+static void
+arc_flush_task(void *arg)
+{
+	arc_async_flush_t *af = arg;
+	hrtime_t start_time = gethrtime();
+	uint64_t spa_guid = af->af_spa_guid;
+
+	arc_flush_impl(spa_guid, B_FALSE);
+	arc_async_flush_remove(spa_guid, af->af_task_id);
+
+	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
+	if (elapsed > 0) {
+		zfs_dbgmsg("spa %llu arc flushed in %llu ms",
+		    (u_longlong_t)spa_guid, (u_longlong_t)elapsed);
+	}
+}
+
+/*
+ * ARC buffers use the spa's load guid and can continue to exist after
+ * the spa_t is gone (exported). The blocks are orphaned since each
+ * spa import has a different load guid.
+ *
+ * It's OK if the spa is re-imported while this asynchronous flush is
+ * still in progress. The new spa_load_guid will be different.
+ *
+ * Also, arc_fini will wait for any arc_flush_task to finish.
+ */
+void
+arc_flush_async(spa_t *spa)
+{
+	uint64_t spa_guid = spa_load_guid(spa);
+	arc_async_flush_t *af = arc_async_flush_add(spa_guid, TASKQID_INVALID);
+
+	/*
+	 * Note that arc_flush_task() needs arc_async_flush_lock to remove
+	 * the af list node, so holding the lock here avoids a race between
+	 * that removal and our use of af.
+	 */
+	mutex_enter(&arc_async_flush_lock);
+	taskqid_t tid = af->af_task_id = taskq_dispatch(arc_flush_taskq,
+	    arc_flush_task, af, TQ_SLEEP);
+	mutex_exit(&arc_async_flush_lock);
+
+	/*
+	 * Unlikely, but if we couldn't dispatch then fall back to an
+	 * inline flush.
+	 */
+	if (tid == TASKQID_INVALID) {
+		arc_flush_impl(spa_guid, B_FALSE);
+		arc_async_flush_remove(spa_guid, TASKQID_INVALID);
+	}
+}
+
+/*
+ * Check if a guid is still in use by an async teardown task.
+ */
+boolean_t
+arc_async_flush_guid_inuse(uint64_t spa_guid)
+{
+	mutex_enter(&arc_async_flush_lock);
+	for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
+	    af != NULL; af = list_next(&arc_async_flush_list, af)) {
+		if (af->af_spa_guid == spa_guid) {
+			mutex_exit(&arc_async_flush_lock);
+			return (B_TRUE);
+		}
+	}
+	mutex_exit(&arc_async_flush_lock);
+	return (B_FALSE);
+}
+
+/*
+ * Initialize a list of spa guids that are being flushed.
+ *
+ * Used by arc_evict_state() to target headers belonging to spas on this list.
+ */
+static unsigned int
+arc_async_flush_init_spa_list(uint64_t spa_list[], unsigned int list_len)
+{
+	unsigned int init_cnt = 0;
+
+	/*
+	 * Iterate until the end of the list or until the array slots are full.
+	 */
+	mutex_enter(&arc_async_flush_lock);
+	for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
+	    init_cnt < list_len && af != NULL;
+	    af = list_next(&arc_async_flush_list, af)) {
+		spa_list[init_cnt++] = af->af_spa_guid;
+	}
+	mutex_exit(&arc_async_flush_lock);
+
+	return (init_cnt);
+}
+
 uint64_t
 arc_reduce_target_size(uint64_t to_free)
 {
@@ -7744,6 +7923,12 @@ arc_init(void)
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
+	list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
+	    offsetof(arc_async_flush_t, af_node));
+	mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
+	arc_flush_taskq = taskq_create("arc_flush", 75, defclsyspri,
+	    1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
@@ -7809,6 +7994,10 @@ arc_fini(void)
 	arc_lowmem_fini();
 #endif /* _KERNEL */
 
+	/* Wait for any background flushes */
+	taskq_wait(arc_flush_taskq);
+	taskq_destroy(arc_flush_taskq);
+
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
 
@@ -7820,6 +8009,9 @@ arc_fini(void)
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
 
+	list_destroy(&arc_async_flush_list);
+	mutex_destroy(&arc_async_flush_lock);
+
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_remove_head(&arc_prune_list)) != NULL) {
 		(void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
@@ -8191,6 +8383,18 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 	return (next);
 }
 
+static boolean_t
+l2arc_dev_invalid(const l2arc_dev_t *dev)
+{
+	/*
+	 * We want to skip devices that are being rebuilt, trimmed,
+	 * removed, or belong to a spa that is being exported.
+	 */
+	return (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev) ||
+	    dev->l2ad_rebuild || dev->l2ad_trim_all ||
+	    dev->l2ad_spa == NULL || dev->l2ad_spa->spa_is_exporting);
+}
+
 /*
  * Cycle through L2ARC devices. This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
@@ -8231,12 +8435,10 @@
 			break;
 
 		ASSERT3P(next, !=, NULL);
-	} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
-	    next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting);
+	} while (l2arc_dev_invalid(next));
 
 	/* if we were unable to find any usable vdevs, return NULL */
-	if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
-	    next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting)
+	if (l2arc_dev_invalid(next))
 		next = NULL;
 
 	l2arc_dev_last = next;
@@ -8366,6 +8568,8 @@ top:
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			l2arc_hdr_arcstats_decrement(hdr);
 
+			ASSERT(dev->l2ad_vdev != NULL);
+
 			bytes_dropped +=
 			    vdev_psize_to_asize(dev->l2ad_vdev, psize);
 			(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
@@ -8747,6 +8951,8 @@ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
 	if (dev->l2ad_log_entries == 0) {
 		return (0);
 	} else {
+		ASSERT(dev->l2ad_vdev != NULL);
+
 		uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
 
 		uint64_t log_blocks = (log_entries +
@@ -8775,6 +8981,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 	vdev_t *vd = dev->l2ad_vdev;
 	boolean_t rerun;
 
+	ASSERT(vd != NULL || all);
+	ASSERT(dev->l2ad_spa != NULL || all);
+
 	buflist = &dev->l2ad_buflist;
 
 top:
@@ -8867,7 +9076,8 @@ retry:
 		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
 			break;
 		} else {
-			vdev_space_update(vd, -asize, 0, 0);
+			if (vd != NULL)
+				vdev_space_update(vd, -asize, 0, 0);
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
@@ -9281,6 +9491,8 @@ skip:
 			hdr->b_l2hdr.b_hits = 0;
 			hdr->b_l2hdr.b_arcs_state =
 			    hdr->b_l1hdr.b_state->arcs_state;
+			/* l2arc_hdr_arcstats_update() expects a valid asize */
+			HDR_SET_ASIZE(hdr, asize);
 			mutex_enter(&dev->l2ad_mtx);
 			if (pio == NULL) {
 				/*
@@ -9446,8 +9658,10 @@ l2arc_feed_thread(void *unused)
 		 * held to prevent device removal.  l2arc_dev_get_next()
 		 * will grab and release l2arc_dev_mtx.
		 */
-		if ((dev = l2arc_dev_get_next()) == NULL)
+		if ((dev = l2arc_dev_get_next()) == NULL ||
+		    dev->l2ad_spa == NULL) {
 			continue;
+		}
 
 		spa = dev->l2ad_spa;
 		ASSERT3P(spa, !=, NULL);
@@ -9532,6 +9746,12 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
 	uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
 	spa_t *spa = dev->l2ad_spa;
 
+	/*
+	 * After an l2arc_remove_vdev(), the spa_t will no longer be valid.
+	 */
+	if (spa == NULL)
+		return;
+
 	/*
 	 * The L2ARC has to hold at least the payload of one log block for
 	 * them to be restored (persistent L2ARC). The payload of a log block
@@ -9699,39 +9919,19 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
 	l2arc_rebuild_dev(dev, reopen);
 }
 
-/*
- * Remove a vdev from the L2ARC.
- */
-void
-l2arc_remove_vdev(vdev_t *vd)
+typedef struct {
+	l2arc_dev_t	*rva_l2arc_dev;
+	uint64_t	rva_spa_gid;
+	uint64_t	rva_vdev_gid;
+	taskqid_t	rva_task_id;
+} remove_vdev_args_t;
+
+static void
+l2arc_device_teardown(void *arg)
 {
-	l2arc_dev_t *remdev = NULL;
-
-	/*
-	 * Find the device by vdev
-	 */
-	remdev = l2arc_vdev_get(vd);
-	ASSERT3P(remdev, !=, NULL);
-
-	/*
-	 * Cancel any ongoing or scheduled rebuild.
-	 */
-	mutex_enter(&l2arc_rebuild_thr_lock);
-	if (remdev->l2ad_rebuild_began == B_TRUE) {
-		remdev->l2ad_rebuild_cancel = B_TRUE;
-		while (remdev->l2ad_rebuild == B_TRUE)
-			cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
-	}
-	mutex_exit(&l2arc_rebuild_thr_lock);
-
-	/*
-	 * Remove device from global list
-	 */
-	mutex_enter(&l2arc_dev_mtx);
-	list_remove(l2arc_dev_list, remdev);
-	l2arc_dev_last = NULL;	/* may have been invalidated */
-	atomic_dec_64(&l2arc_ndev);
-	mutex_exit(&l2arc_dev_mtx);
+	remove_vdev_args_t *rva = arg;
+	l2arc_dev_t *remdev = rva->rva_l2arc_dev;
+	hrtime_t start_time = gethrtime();
 
 	/*
 	 * Clear all buflists and ARC references. L2ARC device flush.
@@ -9746,6 +9946,93 @@ l2arc_remove_vdev(vdev_t *vd)
 	zfs_refcount_destroy(&remdev->l2ad_lb_count);
 	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
 	vmem_free(remdev, sizeof (l2arc_dev_t));
+
+	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
+	if (elapsed > 0) {
+		zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms",
+		    (u_longlong_t)rva->rva_spa_gid,
+		    (u_longlong_t)rva->rva_vdev_gid,
+		    (u_longlong_t)elapsed);
+	}
+
+	arc_async_flush_remove(rva->rva_spa_gid, rva->rva_task_id);
+
+	kmem_free(rva, sizeof (remove_vdev_args_t));
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	boolean_t asynchronous = spa->spa_state == POOL_STATE_EXPORTED ||
+	    spa->spa_state == POOL_STATE_DESTROYED;
+
+	/*
+	 * Find the device by vdev
+	 */
+	l2arc_dev_t *remdev = l2arc_vdev_get(vd);
+	ASSERT3P(remdev, !=, NULL);
+
+	/*
+	 * Save info for final teardown
+	 */
+	remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t),
+	    KM_SLEEP);
+	rva->rva_l2arc_dev = remdev;
+	rva->rva_spa_gid = spa_guid(remdev->l2ad_spa);
+	rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid;
+
+	/*
+	 * Cancel any ongoing or scheduled rebuild.
+	 */
+	mutex_enter(&l2arc_rebuild_thr_lock);
+	if (remdev->l2ad_rebuild_began == B_TRUE) {
+		remdev->l2ad_rebuild_cancel = B_TRUE;
+		while (remdev->l2ad_rebuild == B_TRUE)
+			cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
+	} else if (remdev->l2ad_rebuild == B_TRUE) {
+		/* Rebuild hasn't started yet so skip asynchronous teardown */
+		asynchronous = B_FALSE;
+	}
+	mutex_exit(&l2arc_rebuild_thr_lock);
+
+	/*
+	 * Remove device from global list
+	 */
+	ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC);
+	mutex_enter(&l2arc_dev_mtx);
+	list_remove(l2arc_dev_list, remdev);
+	l2arc_dev_last = NULL;	/* may have been invalidated */
+	atomic_dec_64(&l2arc_ndev);
+
+	/* During a pool export spa & vdev will no longer be valid */
+	if (asynchronous) {
+		remdev->l2ad_spa = NULL;
+		remdev->l2ad_vdev = NULL;
+	}
+	mutex_exit(&l2arc_dev_mtx);
+
+	if (!asynchronous) {
+		l2arc_device_teardown(rva);
+		return;
+	}
+
+	uint64_t spa_guid = spa_load_guid(spa);
+	arc_async_flush_t *af = arc_async_flush_add(spa_guid, TASKQID_INVALID);
+
+	mutex_enter(&arc_async_flush_lock);
+	taskqid_t tid = taskq_dispatch(arc_flush_taskq, l2arc_device_teardown,
+	    rva, TQ_SLEEP);
+	rva->rva_task_id = af->af_task_id = tid;
+	mutex_exit(&arc_async_flush_lock);
+
+	if (tid == TASKQID_INVALID) {
+		l2arc_device_teardown(rva);
+		arc_async_flush_remove(spa_guid, TASKQID_INVALID);
+	}
 }
 
 void
@@ -10003,6 +10290,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild_cancel) {
 				dev->l2ad_rebuild = B_FALSE;
+				/* After signaling, the spa & vdev go away */
 				cv_signal(&l2arc_rebuild_thr_cv);
 				mutex_exit(&l2arc_rebuild_thr_lock);
 				err = SET_ERROR(ECANCELED);
@@ -10042,7 +10330,15 @@ out:
 	vmem_free(this_lb, sizeof (*this_lb));
 	vmem_free(next_lb, sizeof (*next_lb));
 
-	if (!l2arc_rebuild_enabled) {
+	if (err == ECANCELED) {
+		/*
+		 * In case the rebuild was canceled do not log to spa history
+		 * log as the pool may be in the process of being removed.
+		 */
+		zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
+		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+		return (err);
+	} else if (!l2arc_rebuild_enabled) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "disabled");
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
@@ -10060,13 +10356,6 @@ out:
 		    "no valid log blocks");
 		memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize);
 		l2arc_dev_hdr_update(dev);
-	} else if (err == ECANCELED) {
-		/*
-		 * In case the rebuild was canceled do not log to spa history
-		 * log as the pool may be in the process of being removed.
-		 */
-		zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
-		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err != 0) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "aborted, restored %llu blocks",
@@ -10354,6 +10643,7 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
 	    L2BLK_GET_STATE((le)->le_prop));
 	asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    L2BLK_GET_PSIZE((le)->le_prop));
+	ASSERT3U(asize, ==, HDR_GET_ASIZE(hdr));
 
 	/*
 	 * vdev_space_update() has to be called before arc_hdr_destroy() to
@@ -10383,6 +10673,8 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
 		exists->b_l2hdr.b_daddr = le->le_daddr;
 		exists->b_l2hdr.b_arcs_state =
 		    L2BLK_GET_STATE((le)->le_prop);
+		/* l2arc_hdr_arcstats_update() expects a valid asize */
+		HDR_SET_ASIZE(exists, asize);
 		mutex_enter(&dev->l2ad_mtx);
 		list_insert_tail(&dev->l2ad_buflist, exists);
 		(void) zfs_refcount_add_many(&dev->l2ad_alloc,
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 342ec5c15c..d5af83659a 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -404,13 +404,21 @@ dsl_pool_close(dsl_pool_t *dp)
 	taskq_destroy(dp->dp_zil_clean_taskq);
 	spa_sync_tq_destroy(dp->dp_spa);
 
-	/*
-	 * We can't set retry to TRUE since we're explicitly specifying
-	 * a spa to flush. This is good enough; any missed buffers for
-	 * this spa won't cause trouble, and they'll eventually fall
-	 * out of the ARC just like any other unused buffer.
-	 */
-	arc_flush(dp->dp_spa, FALSE);
+	if (dp->dp_spa->spa_state == POOL_STATE_EXPORTED ||
+	    dp->dp_spa->spa_state == POOL_STATE_DESTROYED) {
+		/*
+		 * On export/destroy perform the ARC flush asynchronously.
+		 */
+		arc_flush_async(dp->dp_spa);
+	} else {
+		/*
+		 * We can't set retry to TRUE since we're explicitly specifying
+		 * a spa to flush. This is good enough; any missed buffers for
+		 * this spa won't cause trouble, and they'll eventually fall
+		 * out of the ARC just like any other unused buffer.
+		 */
+		arc_flush(dp->dp_spa, FALSE);
+	}
 
 	mmp_fini(dp->dp_spa);
 	txg_fini(dp);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 97191e7685..67be9fe1d3 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1588,6 +1588,34 @@ spa_generate_guid(spa_t *spa)
 	return (guid);
 }
 
+static boolean_t
+spa_load_guid_exists(uint64_t guid)
+{
+	avl_tree_t *t = &spa_namespace_avl;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+		if (spa_load_guid(spa) == guid)
+			return (B_TRUE);
+	}
+
+	return (arc_async_flush_guid_inuse(guid));
+}
+
+uint64_t
+spa_generate_load_guid(void)
+{
+	uint64_t guid;
+
+	do {
+		(void) random_get_pseudo_bytes((void *)&guid,
+		    sizeof (guid));
+	} while (guid == 0 || spa_load_guid_exists(guid));
+
+	return (guid);
+}
+
 void
 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
 {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 6ae0a14127..ec723febf8 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -631,7 +631,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
-		spa->spa_load_guid = spa_generate_guid(NULL);
+		spa->spa_load_guid = spa_generate_load_guid();
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
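
Reviewer note (not part of the patch): the core bookkeeping this change relies on is a mutex-protected list of load guids with an in-flight asynchronous flush, which spa_generate_load_guid() consults so a guid is never reused while a flush task still owns buffers tagged with it. The following standalone userspace C sketch illustrates that pattern only; the names flush_guid_add()/flush_guid_remove()/flush_guid_inuse() and the pthread-based locking are hypothetical simplifications, not the kernel interfaces used above.

/*
 * Minimal sketch, assuming a single-linked list and pthread mutex in place
 * of the kernel list_t/kmutex_t used by arc_async_flush_{add,remove} and
 * arc_async_flush_guid_inuse() in the patch.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct flush_entry {
	uint64_t		fe_guid;
	struct flush_entry	*fe_next;
} flush_entry_t;

static flush_entry_t *flush_list;
static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;

/* Record that an async flush for this guid has been dispatched. */
static void
flush_guid_add(uint64_t guid)
{
	flush_entry_t *fe = malloc(sizeof (*fe));
	fe->fe_guid = guid;
	pthread_mutex_lock(&flush_lock);
	fe->fe_next = flush_list;
	flush_list = fe;
	pthread_mutex_unlock(&flush_lock);
}

/* Called by the flush task once the guid's buffers have been evicted. */
static void
flush_guid_remove(uint64_t guid)
{
	pthread_mutex_lock(&flush_lock);
	for (flush_entry_t **fp = &flush_list; *fp != NULL;
	    fp = &(*fp)->fe_next) {
		if ((*fp)->fe_guid == guid) {
			flush_entry_t *fe = *fp;
			*fp = fe->fe_next;
			free(fe);
			break;
		}
	}
	pthread_mutex_unlock(&flush_lock);
}

/* Guid generation loops until this returns false for a fresh candidate. */
static bool
flush_guid_inuse(uint64_t guid)
{
	bool inuse = false;
	pthread_mutex_lock(&flush_lock);
	for (flush_entry_t *fe = flush_list; fe != NULL; fe = fe->fe_next) {
		if (fe->fe_guid == guid) {
			inuse = true;
			break;
		}
	}
	pthread_mutex_unlock(&flush_lock);
	return (inuse);
}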