diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index a38083714b..43818d4104 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -989,7 +989,7 @@ extern unsigned long zfs_arc_max; extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); -extern void arc_wait_for_eviction(uint64_t); +extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 3b8b11cff0..fddb1f0e87 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -233,7 +233,7 @@ arc_lowmem(void *arg __unused, int howto __unused) * with ARC reclaim thread. */ if (curproc == pageproc) - arc_wait_for_eviction(to_free); + arc_wait_for_eviction(to_free, B_FALSE); } void diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index c3e889d541..f96cd1271e 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -217,7 +217,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) * for the requested amount of data to be evicted. */ arc_reduce_target_size(ptob(sc->nr_to_scan)); - arc_wait_for_eviction(ptob(sc->nr_to_scan)); + arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); if (current->reclaim_state != NULL) current->reclaim_state->reclaimed_slab += sc->nr_to_scan; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 677b800292..a89a9e65d3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -835,12 +835,13 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_DO_ADAPT = 0x2, + ARC_HDR_USE_RESERVE = 0x4, }; -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); @@ -1855,7 +1856,8 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + ARC_HDR_DO_ADAPT); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -3170,7 +3172,6 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); - boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3181,14 +3182,14 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, - do_adapt); + alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, - do_adapt); + alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } @@ -3234,13 +3235,34 @@ arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } +/* + * Allocate empty anonymous ARC header. The header will get its identity + * assigned and buffers attached later as part of read or write operations. + * + * In case of read arc_read() assigns header its identify (b_dva + b_birth), + * inserts it into ARC hash to become globally visible and allocates physical + * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read + * completion arc_read_done() allocates ARC buffer(s) as needed, potentially + * sharing one of them with the physical ABD buffer. + * + * In case of write arc_alloc_buf() allocates ARC buffer to be filled with + * data. Then after compression and/or encryption arc_write_ready() allocates + * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD + * buffer. On disk write completion arc_write_done() assigns the header its + * new identity (b_dva + b_birth) and inserts into ARC hash. + * + * In case of partial overwrite the old data is read first as described. Then + * arc_release() either allocates new anonymous ARC header and moves the ARC + * buffer to it, or reuses the old ARC header by discarding its identity and + * removing it from ARC hash. After buffer modification normal write process + * follows as described. + */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, - arc_buf_contents_t type, boolean_t alloc_rdata) + arc_buf_contents_t type) { arc_buf_hdr_t *hdr; - int flags = ARC_HDR_DO_ADAPT; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { @@ -3248,7 +3270,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } - flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0; ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); @@ -3272,12 +3293,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; - /* - * Allocate the hdr's buffer. This will contain either - * the compressed or uncompressed data depending on the block - * it references and compressed arc enablement. - */ - arc_hdr_alloc_abd(hdr, flags); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -3564,7 +3579,7 @@ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE); + B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3588,7 +3603,7 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE); + B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3596,16 +3611,12 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - if (!arc_buf_is_shared(buf)) { - /* - * To ensure that the hdr has the correct data in it if we call - * arc_untransform() on this buf before it's been written to - * disk, it's easiest if we just set up sharing between the - * buf and the hdr. - */ - arc_hdr_free_abd(hdr, B_FALSE); - arc_share_buf(hdr, buf); - } + /* + * To ensure that the hdr has the correct data in it if we call + * arc_untransform() on this buf before it's been written to disk, + * it's easiest if we just set up sharing between the buf and the hdr. + */ + arc_share_buf(hdr, buf); return (buf); } @@ -3627,7 +3638,7 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, - compression_type, complevel, type, B_TRUE); + compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -5125,7 +5136,7 @@ arc_adapt(int bytes, arc_state_t *state) * zfs_arc_overflow_shift. */ static arc_ovf_level_t -arc_is_overflowing(void) +arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, @@ -5142,17 +5153,19 @@ arc_is_overflowing(void) */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; + if (!use_reserve) + overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, - boolean_t do_adapt) + int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, do_adapt); + arc_get_data_impl(hdr, size, tag, alloc_flags); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { @@ -5166,7 +5179,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, B_TRUE); + arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5183,9 +5196,9 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * of ARC behavior and settings. See arc_lowmem_init(). */ void -arc_wait_for_eviction(uint64_t amount) +arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { - switch (arc_is_overflowing()) { + switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: @@ -5262,12 +5275,12 @@ arc_wait_for_eviction(uint64_t amount) */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, - boolean_t do_adapt) + int alloc_flags) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); - if (do_adapt) + if (alloc_flags & ARC_HDR_DO_ADAPT) arc_adapt(size, state); /* @@ -5283,7 +5296,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ - arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100); + arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, + alloc_flags & ARC_HDR_USE_RESERVE); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -6093,8 +6107,7 @@ top: arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type, - encrypted_read); + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); @@ -6108,6 +6121,7 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } + alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6155,9 +6169,9 @@ top: */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); - arc_hdr_alloc_abd(hdr, alloc_flags); } + arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); @@ -6679,7 +6693,7 @@ arc_release(arc_buf_t *buf, void *tag) * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, - compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr)); + compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); @@ -6859,7 +6873,8 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { /* @@ -6869,17 +6884,19 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, - ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -8689,7 +8706,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - B_TRUE); + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8726,7 +8743,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - B_TRUE); + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),