From a8d83e2a24de6419dc58d2a7b8f38904985726cb Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Wed, 8 Mar 2023 14:17:23 -0500
Subject: [PATCH] More adaptive ARC eviction

Traditionally, ARC adaptation was limited to the MRU/MFU distribution.
But for years people with metadata-centric workloads have demanded
mechanisms to also manage the data/metadata distribution, which in the
original ZFS was just a FIFO.  As a result, ZFS effectively got
separate states for data and metadata, minimum and maximum metadata
limits, etc., but it all required manual tuning, was not adaptive, and
at its heart remained a bad FIFO.

This change removes most of the existing eviction logic, rewriting it
from scratch.  It makes MRU/MFU adaptation individual for data and
metadata, as is the distribution between data and metadata themselves.
Since most of the required state separation was already done, this
only required making the arcs_size state field specific per
data/metadata.

The adaptation logic is still based on the previous concept of ghost
hits, but now it balances ARC capacity between 4 states: MRU data,
MRU metadata, MFU data and MFU metadata.  To simplify arc_c changes,
instead of arc_p measured in bytes, this code uses 3 variables:
arc_meta, arc_pd and arc_pm, representing the ARC balance between
metadata and data, between MRU and MFU for data, and between MRU and
MFU for metadata respectively, as 32-bit fixed-point fractions.  Since
we care about the math result only when we need to evict, this moves
all the logic from arc_adapt() to arc_evict(), which reduces per-block
overhead: per-block operations are now limited to stats collection,
moved from arc_adapt() to arc_access() and using cheaper wmsums.  This
also allows removing the ugly ARC_HDR_DO_ADAPT flag from many places.

This change also removes a number of metadata-specific tunables, some
of which were actually not functioning correctly, since not all
metadata are equal and some (like L2ARC headers) are not really
evictable.  Instead it introduces a single opaque knob,
zfs_arc_meta_balance, tuning the ARC's reaction to ghost hits and
allowing the administrator to give more or less preference to metadata
without setting strict limits.

Some old code parts like arc_evict_meta() are simply removed, because
since the introduction of the ABD ARC they really make no sense: only
headers referenced by a small number of buffers are not evictable, and
they remain not evictable no matter what this code does.  Instead just
call arc_prune_async() if too much metadata appears not evictable.

Reviewed-by: Brian Behlendorf
Reviewed-by: Allan Jude
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
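As a rough illustration (this helper is not part of the patch, and its
function and variable names are made up), the three fractions map onto
per-state byte targets roughly like the sketch below, mirroring the
shift-based math the patch adds to arc_evict() and arc_summary:

#include <stdint.h>

struct arc_targets {
	uint64_t mru_data, mru_meta, mfu_data, mfu_meta;
};

/*
 * Hypothetical helper: split "total" cache bytes into four state
 * targets using the fractions (each a value out of 2^32).  The
 * x * (frac >> 16) >> 16 form approximates x * frac / 2^32 without
 * 128-bit math, the same trick the patch uses in arc_evict().
 */
static struct arc_targets
arc_targets_from_fracs(uint64_t total, uint64_t meta, uint64_t pd,
    uint64_t pm)
{
	struct arc_targets t;

	/* First split the total between metadata and data ... */
	uint64_t meta_bytes = total * (meta >> 16) >> 16;
	uint64_t data_bytes = total - meta_bytes;

	/* ... then split each part between MRU and MFU. */
	t.mru_data = data_bytes * (pd >> 16) >> 16;
	t.mfu_data = data_bytes - t.mru_data;
	t.mru_meta = meta_bytes * (pm >> 16) >> 16;
	t.mfu_meta = meta_bytes - t.mru_meta;
	return (t);
}

On ghost hits the fractions themselves are nudged by arc_evict_adj()
(hits in a ghost metadata list raise arc_meta, hits in a ghost data
list lower it, scaled by zfs_arc_meta_balance), and arc_evict() then
evicts each of the four states down toward its target.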
Closes #14359 --- cmd/arc_summary | 101 ++- cmd/zdb/zdb.c | 5 +- include/sys/arc.h | 1 - include/sys/arc_impl.h | 43 +- man/man4/zfs.4 | 82 +-- module/os/freebsd/zfs/arc_os.c | 2 +- module/os/freebsd/zfs/sysctl_os.c | 77 ++- module/os/linux/zfs/arc_os.c | 2 +- module/zfs/arc.c | 941 +++++++++----------------- tests/zfs-tests/tests/perf/perf.shlib | 1 - 10 files changed, 474 insertions(+), 781 deletions(-) diff --git a/cmd/arc_summary b/cmd/arc_summary index 7149629468..5d10e903fc 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -270,16 +270,14 @@ def draw_graph(kstats_dict): arc_perc = f_perc(arc_stats['size'], arc_stats['c_max']) mfu_size = f_bytes(arc_stats['mfu_size']) mru_size = f_bytes(arc_stats['mru_size']) - meta_limit = f_bytes(arc_stats['arc_meta_limit']) meta_size = f_bytes(arc_stats['arc_meta_used']) dnode_limit = f_bytes(arc_stats['arc_dnode_limit']) dnode_size = f_bytes(arc_stats['dnode_size']) - info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) ' - 'DNODE {6} ({7})') + info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ' + 'DNODE {5} ({6})') info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size, - meta_size, meta_limit, dnode_size, - dnode_limit) + meta_size, dnode_size, dnode_limit) info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) info_line = GRAPH_INDENT+info_spc+info_line @@ -558,16 +556,28 @@ def section_arc(kstats_dict): arc_target_size = arc_stats['c'] arc_max = arc_stats['c_max'] arc_min = arc_stats['c_min'] - anon_size = arc_stats['anon_size'] - mfu_size = arc_stats['mfu_size'] - mru_size = arc_stats['mru_size'] - mfug_size = arc_stats['mfu_ghost_size'] - mrug_size = arc_stats['mru_ghost_size'] - unc_size = arc_stats['uncached_size'] - meta_limit = arc_stats['arc_meta_limit'] - meta_size = arc_stats['arc_meta_used'] + meta = arc_stats['meta'] + pd = arc_stats['pd'] + pm = arc_stats['pm'] + anon_data = arc_stats['anon_data'] + anon_metadata = arc_stats['anon_metadata'] + mfu_data = arc_stats['mfu_data'] + mfu_metadata = arc_stats['mfu_metadata'] + mru_data = arc_stats['mru_data'] + mru_metadata = arc_stats['mru_metadata'] + mfug_data = arc_stats['mfu_ghost_data'] + mfug_metadata = arc_stats['mfu_ghost_metadata'] + mrug_data = arc_stats['mru_ghost_data'] + mrug_metadata = arc_stats['mru_ghost_metadata'] + unc_data = arc_stats['uncached_data'] + unc_metadata = arc_stats['uncached_metadata'] + bonus_size = arc_stats['bonus_size'] dnode_limit = arc_stats['arc_dnode_limit'] dnode_size = arc_stats['dnode_size'] + dbuf_size = arc_stats['dbuf_size'] + hdr_size = arc_stats['hdr_size'] + l2_hdr_size = arc_stats['l2_hdr_size'] + abd_chunk_waste_size = arc_stats['abd_chunk_waste_size'] target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min)) prt_2('ARC size (current):', @@ -578,25 +588,56 @@ def section_arc(kstats_dict): f_perc(arc_min, arc_max), f_bytes(arc_min)) prt_i2('Max size (high water):', target_size_ratio, f_bytes(arc_max)) - caches_size = int(anon_size)+int(mfu_size)+int(mru_size)+int(unc_size) - prt_i2('Anonymouns data size:', - f_perc(anon_size, caches_size), f_bytes(anon_size)) - prt_i2('Most Frequently Used (MFU) cache size:', - f_perc(mfu_size, caches_size), f_bytes(mfu_size)) - prt_i2('Most Recently Used (MRU) cache size:', - f_perc(mru_size, caches_size), f_bytes(mru_size)) - prt_i1('Most Frequently Used (MFU) ghost size:', f_bytes(mfug_size)) - prt_i1('Most Recently Used (MRU) ghost size:', f_bytes(mrug_size)) + caches_size = int(anon_data)+int(anon_metadata)+\ + 
int(mfu_data)+int(mfu_metadata)+int(mru_data)+int(mru_metadata)+\ + int(unc_data)+int(unc_metadata) + prt_i2('Anonymous data size:', + f_perc(anon_data, caches_size), f_bytes(anon_data)) + prt_i2('Anonymous metadata size:', + f_perc(anon_metadata, caches_size), f_bytes(anon_metadata)) + s = 4294967296 + v = (s-int(pd))*(s-int(meta))/s + prt_i2('MFU data target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MFU data size:', + f_perc(mfu_data, caches_size), f_bytes(mfu_data)) + prt_i1('MFU ghost data size:', f_bytes(mfug_data)) + v = (s-int(pm))*int(meta)/s + prt_i2('MFU metadata target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MFU metadata size:', + f_perc(mfu_metadata, caches_size), f_bytes(mfu_metadata)) + prt_i1('MFU ghost metadata size:', f_bytes(mfug_metadata)) + v = int(pd)*(s-int(meta))/s + prt_i2('MRU data target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MRU data size:', + f_perc(mru_data, caches_size), f_bytes(mru_data)) + prt_i1('MRU ghost data size:', f_bytes(mrug_data)) + v = int(pm)*int(meta)/s + prt_i2('MRU metadata target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MRU metadata size:', + f_perc(mru_metadata, caches_size), f_bytes(mru_metadata)) + prt_i1('MRU ghost metadata size:', f_bytes(mrug_metadata)) prt_i2('Uncached data size:', - f_perc(unc_size, caches_size), f_bytes(unc_size)) - prt_i2('Metadata cache size (hard limit):', - f_perc(meta_limit, arc_max), f_bytes(meta_limit)) - prt_i2('Metadata cache size (current):', - f_perc(meta_size, meta_limit), f_bytes(meta_size)) - prt_i2('Dnode cache size (hard limit):', - f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit)) - prt_i2('Dnode cache size (current):', + f_perc(unc_data, caches_size), f_bytes(unc_data)) + prt_i2('Uncached metadata size:', + f_perc(unc_metadata, caches_size), f_bytes(unc_metadata)) + prt_i2('Bonus size:', + f_perc(bonus_size, arc_size), f_bytes(bonus_size)) + prt_i2('Dnode cache target:', + f_perc(dnode_limit, arc_max), f_bytes(dnode_limit)) + prt_i2('Dnode cache size:', f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) + prt_i2('Dbuf size:', + f_perc(dbuf_size, arc_size), f_bytes(dbuf_size)) + prt_i2('Header size:', + f_perc(hdr_size, arc_size), f_bytes(hdr_size)) + prt_i2('L2 header size:', + f_perc(l2_hdr_size, arc_size), f_bytes(l2_hdr_size)) + prt_i2('ABD chunk waste size:', + f_perc(abd_chunk_waste_size, arc_size), f_bytes(abd_chunk_waste_size)) print() print('ARC hash breakdown:') diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index c6198ee264..329562418b 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -117,7 +117,6 @@ zdb_ot_name(dmu_object_type_t type) extern int reference_tracking_enable; extern int zfs_recover; -extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; extern uint_t zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; @@ -8809,8 +8808,8 @@ main(int argc, char **argv) * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. 
*/ - zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT; - zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; + zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT; + zfs_arc_max = 256 * 1024 * 1024; #endif /* diff --git a/include/sys/arc.h b/include/sys/arc.h index 2b4f16ee0a..836ed679db 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -200,7 +200,6 @@ struct arc_buf { }; typedef enum arc_buf_contents { - ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 721b050236..fd24d2f3c8 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -82,15 +82,18 @@ typedef struct arc_state { * supports the "dbufs" kstat */ arc_state_type_t arcs_state; + /* + * total amount of data in this state. + */ + zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of evictable data in this state */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned; + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + * amount of hit bytes for this state (counted only for ghost states) */ - zfs_refcount_t arcs_size; + wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; @@ -358,8 +361,9 @@ typedef struct l2arc_lb_ptr_buf { #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) -#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) -#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +/* +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal. */ +#define L2BLK_GET_TYPE(field) (BF64_GET((field), 48, 8) - 1) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, (x) + 1) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) @@ -582,7 +586,9 @@ typedef struct arc_stats { kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; + kstat_named_t arcstat_meta; + kstat_named_t arcstat_pd; + kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; @@ -655,6 +661,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_anon_size; + kstat_named_t arcstat_anon_data; + kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -676,6 +684,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_mru_size; + kstat_named_t arcstat_mru_data; + kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -700,6 +710,8 @@ typedef struct arc_stats { * buffers *would have* consumed this number of bytes. 
*/ kstat_named_t arcstat_mru_ghost_size; + kstat_named_t arcstat_mru_ghost_data; + kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -719,6 +731,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_mfu_size; + kstat_named_t arcstat_mfu_data; + kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu @@ -737,6 +751,8 @@ typedef struct arc_stats { * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; + kstat_named_t arcstat_mfu_ghost_data; + kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -754,6 +770,8 @@ typedef struct arc_stats { * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_size; + kstat_named_t arcstat_uncached_data; + kstat_named_t arcstat_uncached_metadata; /* * Number of data bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. @@ -876,10 +894,7 @@ typedef struct arc_stats { kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; /* Number of predictive prefetch requests. */ kstat_named_t arcstat_predictive_prefetch; @@ -942,7 +957,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - aggsum_t arcstat_dnode_size; + wmsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; @@ -987,7 +1002,7 @@ typedef struct arc_sums { wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; - aggsum_t arcstat_meta_used; + wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; @@ -1015,7 +1030,9 @@ typedef struct arc_evict_waiter { #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ +#define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ +#define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 88a044f63f..e8e2cfec61 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -558,14 +558,6 @@ This value acts as a ceiling to the amount of dnode metadata, and defaults to which indicates that a percent which is based on .Sy zfs_arc_dnode_limit_percent of the ARC meta buffers that may be used for dnodes. -.Pp -Also see -.Sy zfs_arc_meta_prune -which serves a similar purpose but is used -when the amount of metadata in the ARC exceeds -.Sy zfs_arc_meta_limit -rather than in response to overall demand for non-metadata. -. .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage that can be consumed by dnodes of ARC meta buffers. 
.Pp @@ -648,62 +640,10 @@ It cannot be set back to while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. . -.It Sy zfs_arc_meta_adjust_restarts Ns = Ns Sy 4096 Pq uint -The number of restart passes to make while scanning the ARC attempting -the free buffers in order to stay below the -.Sy fs_arc_meta_limit . -This value should not need to be tuned but is available to facilitate -performance analysis. -. -.It Sy zfs_arc_meta_limit Ns = Ns Sy 0 Ns B Pq u64 -The maximum allowed size in bytes that metadata buffers are allowed to -consume in the ARC. -When this limit is reached, metadata buffers will be reclaimed, -even if the overall -.Sy arc_c_max -has not been reached. -It defaults to -.Sy 0 , -which indicates that a percentage based on -.Sy zfs_arc_meta_limit_percent -of the ARC may be used for metadata. -.Pp -This value my be changed dynamically, except that must be set to an explicit -value -.Pq cannot be set back to Sy 0 . -. -.It Sy zfs_arc_meta_limit_percent Ns = Ns Sy 75 Ns % Pq u64 -Percentage of ARC buffers that can be used for metadata. -.Pp -See also -.Sy zfs_arc_meta_limit , -which serves a similar purpose but has a higher priority if nonzero. -. -.It Sy zfs_arc_meta_min Ns = Ns Sy 0 Ns B Pq u64 -The minimum allowed size in bytes that metadata buffers may consume in -the ARC. -. -.It Sy zfs_arc_meta_prune Ns = Ns Sy 10000 Pq int -The number of dentries and inodes to be scanned looking for entries -which can be dropped. -This may be required when the ARC reaches the -.Sy zfs_arc_meta_limit -because dentries and inodes can pin buffers in the ARC. -Increasing this value will cause to dentry and inode caches -to be pruned more aggressively. -Setting this value to -.Sy 0 -will disable pruning the inode and dentry caches. -. -.It Sy zfs_arc_meta_strategy Ns = Ns Sy 1 Ns | Ns 0 Pq uint -Define the strategy for ARC metadata buffer eviction (meta reclaim strategy): -.Bl -tag -compact -offset 4n -width "0 (META_ONLY)" -.It Sy 0 Pq META_ONLY -evict only the ARC metadata buffers -.It Sy 1 Pq BALANCED -additional data buffers may be evicted if required -to evict the required number of metadata buffers. -.El +.It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint +Balance between metadata and data on ghost hits. +Values above 100 increase metadata caching by proportionally reducing effect +of ghost data hits on target data/metadata rate. . .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 Min size of ARC in bytes. @@ -786,20 +726,6 @@ causes the ARC to start reclamation if it exceeds the target size by of the target size, and block allocations by .Em 0.6% . . -.It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq uint -If nonzero, this will update -.Sy arc_p_min_shift Pq default Sy 4 -with the new value. -.Sy arc_p_min_shift No is used as a shift of Sy arc_c -when calculating the minumum -.Sy arc_p No size . -. -.It Sy zfs_arc_p_dampener_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int -Disable -.Sy arc_p -adapt dampener, which reduces the maximum single adjustment to -.Sy arc_p . -. .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint If nonzero, this will update .Sy arc_shrink_shift Pq default Sy 7 diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index dfe5c3d311..a2ff0f386a 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -159,7 +159,7 @@ arc_prune_task(void *arg) /* * Notify registered consumers they must drop holds on a portion of the ARC * buffered they reference. 
This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * honor the metadata limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asynchronously so it may be safely called diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index bd6cfc86ce..35edea0a24 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -359,89 +359,114 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, "No reads during writes (LEGACY)"); /* END CSTYLED */ +static int +param_get_arc_state_size(SYSCTL_HANDLER_ARGS) +{ + arc_state_t *state = (arc_state_t *)arg1; + int64_t val; + + val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + return (sysctl_handle_64(oidp, &val, 0, req)); +} + extern arc_state_t ARC_anon; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_anon, 0, param_get_arc_state_size, "Q", + "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in anonymous state"); + "size of evictable metadata in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in anonymous state"); + "size of evictable data in anonymous state"); /* END CSTYLED */ extern arc_state_t ARC_mru; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru, 0, param_get_arc_state_size, "Q", + "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru state"); + "size of evictable metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru state"); + "size of evictable data in mru state"); /* END CSTYLED */ extern arc_state_t ARC_mru_ghost; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", + "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru ghost state"); + "size of evictable metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru ghost state"); + "size of evictable data in mru ghost state"); /* END CSTYLED */ extern arc_state_t ARC_mfu; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu, 0, param_get_arc_state_size, "Q", + "size of mfu 
state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu state"); + "size of evictable metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu state"); + "size of evictable data in mfu state"); /* END CSTYLED */ extern arc_state_t ARC_mfu_ghost; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", + "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu ghost state"); + "size of evictable metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu ghost state"); + "size of evictable data in mfu ghost state"); /* END CSTYLED */ extern arc_state_t ARC_uncached; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_size, CTLFLAG_RD, - &ARC_uncached.arcs_size.rc_count, 0, "size of uncached state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_uncached, 0, param_get_arc_state_size, "Q", + "size of uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in uncached state"); + "size of evictable metadata in uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in uncached state"); + "size of evictable data in uncached state"); /* END CSTYLED */ extern arc_state_t ARC_l2c_only; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, - &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_l2c_only, 0, param_get_arc_state_size, "Q", + "size of l2c_only state"); /* END CSTYLED */ /* dbuf.c */ diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 6f730e9ddd..b7d6053529 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -504,7 +504,7 @@ arc_prune_task(void *ptr) /* * Notify registered consumers they must drop holds on a portion of the ARC * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * honor the metadata limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asynchronously so it may be safely called diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d851e919e1..aff438777c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -108,12 +108,11 @@ * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In + * metadata limit is reached and no buffers can be safely evicted. 
In * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. + * they can be reclaimed. For example, when using the ZPL each dentry + * holds a references on a znode. These dentries must be pruned before + * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. @@ -377,9 +376,6 @@ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; -/* shift of arc_c for calculating both min and max arc_p */ -static uint_t arc_p_min_shift = 4; - /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; @@ -422,13 +418,10 @@ boolean_t arc_warm; */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; -uint64_t zfs_arc_meta_limit = 0; -uint64_t zfs_arc_meta_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; -static uint_t zfs_arc_p_min_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* @@ -447,10 +440,11 @@ static const unsigned long zfs_arc_pool_dirty_percent = 20; int zfs_compressed_arc_enabled = B_TRUE; /* - * ARC will evict meta buffers that exceed arc_meta_limit. This - * tunable make arc_meta_limit adjustable for different workloads. + * Balance between metadata and data on ghost hits. Values above 100 + * increase metadata caching by proportionally reducing effect of ghost + * data hits on target data/metadata rate. */ -static uint64_t zfs_arc_meta_limit_percent = 75; +static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. 
@@ -463,10 +457,6 @@ static uint_t zfs_arc_dnode_limit_percent = 10; static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; -static int zfs_arc_p_dampener_disable = 1; -static uint_t zfs_arc_meta_prune = 10000; -static uint_t zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; -static uint_t zfs_arc_meta_adjust_restarts = 4096; static uint_t zfs_arc_lotsfree_percent = 10; /* @@ -520,7 +510,9 @@ arc_stats_t arc_stats = { { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, + { "meta", KSTAT_DATA_UINT64 }, + { "pd", KSTAT_DATA_UINT64 }, + { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, @@ -538,21 +530,33 @@ arc_stats_t arc_stats = { { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_data", KSTAT_DATA_UINT64 }, + { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_data", KSTAT_DATA_UINT64 }, + { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_data", KSTAT_DATA_UINT64 }, + { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, + { "uncached_data", KSTAT_DATA_UINT64 }, + { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, @@ -607,10 +611,7 @@ arc_stats_t arc_stats = { { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, @@ -683,10 +684,7 @@ static kstat_t *arc_ksp; */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -/* max size for dnodes */ -#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; @@ -859,7 +857,6 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, - ARC_HDR_DO_ADAPT = 0x2, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, 
}; @@ -1875,7 +1872,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, @@ -1902,8 +1899,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -2420,7 +2416,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() @@ -2465,7 +2461,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - multilist_remove(&old_state->arcs_list[buftype], + multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } @@ -2478,7 +2474,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(&new_state->arcs_list[buftype], hdr); + multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } @@ -2501,7 +2497,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * the reference. As a result, we use the arc * header pointer for the reference. */ - (void) zfs_refcount_add_many(&new_state->arcs_size, + (void) zfs_refcount_add_many( + &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2529,20 +2526,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) continue; (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } @@ -2563,7 +2560,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * header on the ghost state. 
*/ - (void) zfs_refcount_remove_many(&old_state->arcs_size, + (void) zfs_refcount_remove_many( + &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; @@ -2589,8 +2587,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) continue; (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); + &old_state->arcs_size[type], + arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -2598,14 +2596,14 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), - hdr); + &old_state->arcs_size[type], + arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, HDR_GET_PSIZE(hdr), - hdr); + &old_state->arcs_size[type], + HDR_GET_PSIZE(hdr), hdr); } } } @@ -2639,7 +2637,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, space); + ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2662,7 +2660,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) - aggsum_add(&arc_sums.arcstat_meta_used, space); + ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } @@ -2685,7 +2683,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, -space); + ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -2701,13 +2699,8 @@ arc_space_return(uint64_t space, arc_space_type_t type) break; } - if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { - ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, - space) >= 0); - ARCSTAT_MAX(arcstat_meta_max, - aggsum_upper_bound(&arc_sums.arcstat_meta_used)); - aggsum_add(&arc_sums.arcstat_meta_used, -space); - } + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) + ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); @@ -2974,7 +2967,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { @@ -3007,7 +3000,8 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, @@ -3036,7 +3030,8 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. 
*/ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); @@ -3537,7 +3532,7 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) /* unset all members of the original hdr */ memset(&hdr->b_dva, 0, sizeof (dva_t)); hdr->b_birth = 0; - hdr->b_type = ARC_BUFC_INVALID; + hdr->b_type = 0; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; @@ -4195,8 +4190,7 @@ arc_state_alloc_markers(int count) /* * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_type() and - * arc_evict_state_impl(). + * a marker. This fact is used in arc_evict_state_impl(). */ markers[i]->b_spa = 0; @@ -4226,8 +4220,8 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count) * the given arc state; which is used by arc_flush(). */ static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, - arc_buf_contents_t type) +arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, + uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; @@ -4265,19 +4259,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; - /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. - */ - if (type == ARC_BUFC_DATA && aggsum_compare( - &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { - arc_prune_async((aggsum_upper_bound( - &arc_sums.arcstat_dnode_size) - - arc_dnode_size_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all @@ -4362,7 +4343,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { - evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; @@ -4372,252 +4353,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, } /* - * Evict the specified number of bytes from the state specified, - * restricting eviction to the spa and type given. This function - * prevents us from trying to evict more from a state's list than - * is "evictable", and to skip evicting altogether when passed a + * Evict the specified number of bytes from the state specified. This + * function prevents us from trying to evict more from a state's list + * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". 
*/ static uint64_t -arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) +arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); - return (arc_evict_state(state, spa, delta, type)); + return (arc_evict_state(state, type, 0, delta)); } return (0); } /* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. + * Adjust specified fraction, taking into account initial ghost state(s) size, + * ghost hit bytes towards increasing the fraction, ghost hit bytes towards + * decreasing it, plus a balance factor, controlling the decrease rate, used + * to balance metadata vs data. */ static uint64_t -arc_evict_meta_balanced(uint64_t meta_used) +arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, + uint_t balance) { - int64_t delta, adjustmnt; - uint64_t total_evicted = 0, prune = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - uint_t restarts = zfs_arc_meta_adjust_restarts; - -restart: - /* - * This slightly differs than the way we evict from the mru in - * arc_evict because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. - */ - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mru, 0, delta, type); - adjustmnt -= delta; - } + if (total < 8 || up + down == 0) + return (frac); /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. + * We should not have more ghost hits than ghost size, but they + * may get close. Restrict maximum adjustment in that case. 
*/ - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); + if (up + down >= total / 4) { + uint64_t scale = (up + down) / (total / 8); + up /= scale; + down /= scale; } - adjustmnt = meta_used - arc_meta_limit; + /* Get maximal dynamic range by choosing optimal shifts. */ + int s = highbit64(total); + s = MIN(64 - s, 32); - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } + uint64_t ofrac = (1ULL << 32) - frac; - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); - } + if (frac >= 4 * ofrac) + up /= frac / (2 * ofrac + 1); + up = (up << s) / (total >> (32 - s)); + if (ofrac >= 4 * frac) + down /= ofrac / (2 * frac + 1); + down = (down << s) / (total >> (32 - s)); + down = down * 100 / balance; - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. - */ - if (meta_used > arc_meta_limit || arc_available_memory() < 0) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arcstat_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_evict_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). 
- */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -static uint64_t -arc_evict_meta(uint64_t meta_used) -{ - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_evict_meta_only(meta_used)); - else - return (arc_evict_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. - */ -static arc_buf_contents_t -arc_evict_type(arc_state_t *state) -{ - multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; - - /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). - */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). - */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; - } - - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); - - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } - - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); - - return (type); + return (frac + up - down); } /* @@ -4626,150 +4419,128 @@ arc_evict_type(arc_state_t *state) static uint64_t arc_evict(void) { - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); - uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); + uint64_t asize, bytes, total_evicted = 0; + int64_t e, mrud, mrum, mfud, mfum, w; + static uint64_t ogrd, ogrm, ogfd, ogfm; + static uint64_t gsrd, gsrm, gsfd, gsfm; + uint64_t ngrd, ngrm, ngfd, ngfm; - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_evict_meta(ameta); + /* Get current size of ARC states we can evict from. 
*/ + mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); + mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + uint64_t d = mrud + mfud; + uint64_t m = mrum + mfum; + uint64_t t = d + m; - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); + /* Get ARC ghost hits since last eviction. */ + ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t grd = ngrd - ogrd; + ogrd = ngrd; + ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t grm = ngrm - ogrm; + ogrm = ngrm; + ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t gfd = ngfd - ogfd; + ogfd = ngfd; + ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t gfm = ngfm - ogfm; + ogfm = ngfm; - /* - * If we're below arc_meta_min, always prefer to evict data. - * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. - */ - if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; + /* Adjust ARC states balance based on ghost hits. */ + arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, + grm + gfm, grd + gfd, zfs_arc_meta_balance); + arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); + arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from metadata. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - } - - /* - * Re-sum ARC stats after the first round of evictions. - */ asize = aggsum_value(&arc_sums.arcstat_size); - ameta = aggsum_value(&arc_sums.arcstat_meta_used); - + int64_t wt = t - (asize - arc_c); /* - * Adjust MFU size - * - * Now that we've tried to evict enough from the MRU to get its - * size back to arc_p, if we're still above the target cache - * size, we evict the rest from the MFU. + * Try to reduce pinned dnodes if more than 3/4 of wanted metadata + * target is not evictable or if they go over arc_dnode_limit. 
*/ - target = asize - arc_c; - - if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + int64_t prune = 0; + int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + w = wt * (arc_meta >> 16) >> 16; + if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > + w * 3 / 4) { + prune = dn / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } else if (dn > arc_dnode_limit) { + prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; } + if (prune > 0) + arc_prune_async(prune); - /* - * Adjust ghost lists - * - * In addition to the above, the ARC also defines target values - * for the ghost lists. The sum of the mru list and mru ghost - * list should never exceed the target size of the cache, and - * the sum of the mru list, mfu list, mru ghost list, and mfu - * ghost list should never exceed twice the target size of the - * cache. The following logic enforces these limits on the ghost - * caches, and evicts from them as needed. - */ - target = zfs_refcount_count(&arc_mru->arcs_size) + - zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; + /* Evict MRU metadata. */ + w = wt * (arc_meta * arc_pm >> 48) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); + total_evicted += bytes; + mrum -= bytes; + asize -= bytes; - bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + /* Evict MFU metadata. */ + w = wt * (arc_meta >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); + total_evicted += bytes; + mfum -= bytes; + asize -= bytes; + + /* Evict MRU data. */ + wt -= m - total_evicted; + w = wt * (arc_pd >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); + total_evicted += bytes; + mrud -= bytes; + asize -= bytes; + + /* Evict MFU data. */ + e = asize - arc_c; + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); + mfud -= bytes; total_evicted += bytes; - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); - /* - * We assume the sum of the mru list and mfu list is less than - * or equal to arc_c (we enforced this above), which means we - * can use the simpler of the two equations below: + * Evict ghost lists * - * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c - * mru ghost + mfu ghost <= arc_c + * Size of each state's ghost list represents how much that state + * may grow by shrinking the other states. Would it need to shrink + * other states to zero (that is unlikely), its ghost size would be + * equal to sum of other three state sizes. 
But excessive ghost + * size may result in false ghost hits (too far back), which may + * never result in real cache hits if several states are competing. + * So choose some arbitrary point of 1/2 of the other state sizes. */ - target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + - zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; + gsrd = (mrum + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - + gsrd; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); - bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; + gsrm = (mrud + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsrm; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); - target -= bytes; + gsfd = (mrud + mrum + mfum) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - + gsfd; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); - total_evicted += - arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + gsfm = (mrud + mrum + mfud) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsfm; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } @@ -4808,7 +4579,10 @@ arc_flush(spa_t *spa, boolean_t retry) void arc_reduce_target_size(int64_t to_free) { - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + uint64_t c = arc_c; + + if (c <= arc_c_min) + return; /* * All callers want the ARC to actually evict (at least) this much @@ -4818,26 +4592,16 @@ arc_reduce_target_size(int64_t to_free) * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ - uint64_t c = MIN(arc_c, asize); + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + if (asize < c) + to_free += c - asize; + arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); - if (c > to_free && c - to_free > arc_c_min) { - arc_c = c - to_free; - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } else { - arc_c = arc_c_min; - } - - if (asize > arc_c) { - /* See comment in arc_evict_cb_check() on why lock+flag */ - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - mutex_exit(&arc_evict_lock); - zthr_wakeup(arc_evict_zthr); - } + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); } /* @@ -4859,14 +4623,6 @@ arc_kmem_reap_soon(void) kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL - if ((aggsum_compare(&arc_sums.arcstat_meta_used, - arc_meta_limit) >= 0) && zfs_arc_meta_prune) { - /* - * We are exceeding our meta-data cache limit. - * Prune some entries to release holds on meta-data. - */ - arc_prune_async(zfs_arc_meta_prune); - } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. @@ -5143,40 +4899,8 @@ arc_reap_cb(void *arg, zthr_t *zthr) * when we are adding new content to the cache. */ static void -arc_adapt(int bytes, arc_state_t *state) +arc_adapt(uint64_t bytes) { - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list.
- * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + (uint64_t)bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - /* * Wake reap thread if we do not have any available memory */ @@ -5195,18 +4919,12 @@ arc_adapt(int bytes, arc_state_t *state) * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ - ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); - if (aggsum_upper_bound(&arc_sums.arcstat_size) >= - arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) + if (aggsum_upper_bound(&arc_sums.arcstat_size) + + 2 * SPA_MAXBLOCKSIZE >= arc_c) { + uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); + if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; - else if (state == arc_anon && arc_p < arc_c >> 1) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; } - ASSERT((int64_t)arc_p >= 0); } /* @@ -5255,7 +4973,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); + arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5353,11 +5071,7 @@ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (alloc_flags & ARC_HDR_DO_ADAPT) - arc_adapt(size, state); + arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data @@ -5375,7 +5089,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); - VERIFY3U(hdr->b_type, ==, type); + arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { @@ -5386,9 +5100,11 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. 
*/ + arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); + (void) zfs_refcount_add_many(&state->arcs_size[type], size, + tag); /* * If this is reached via arc_read, the link is @@ -5404,17 +5120,6 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p && - arc_p < arc_c >> 1)) - arc_p = MIN(arc_c, arc_p + size); } } @@ -5457,7 +5162,7 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -5570,6 +5275,8 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); @@ -5597,6 +5304,8 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { @@ -6156,6 +5865,7 @@ top: uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) @@ -6170,7 +5880,6 @@ top: * embedded data. */ arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); @@ -6229,11 +5938,6 @@ top: alloc_flags |= ARC_HDR_ALLOC_LINEAR; } - /* - * Call arc_adapt() explicitly before arc_access() to allow - * its logic to balance MRU/MFU based on the original state. - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); /* * Take additional reference for IO_IN_PROGRESS. 
It stops * arc_access() from putting this header without any buffers @@ -6706,7 +6410,7 @@ arc_release(arc_buf_t *buf, const void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -6728,7 +6432,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many(&state->arcs_size, + (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { @@ -6766,7 +6470,7 @@ arc_release(arc_buf_t *buf, const void *tag) (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; - (void) zfs_refcount_add_many(&arc_anon->arcs_size, + (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -6922,7 +6626,7 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || @@ -6935,19 +6639,17 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -7202,7 +6904,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - + anon_size = MAX((int64_t) + (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* @@ -7258,9 +6962,14 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { - size->value.ui64 = zfs_refcount_count(&state->arcs_size); + data->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); + metadata->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = @@ -7360,37 +7069,49 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - 
aggsum_value(&arc_sums.arcstat_dnode_size) + + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, + &as->arcstat_anon_data, + &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, + &as->arcstat_mru_data, + &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_data, + &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, + &as->arcstat_mfu_data, + &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_data, + &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, + &as->arcstat_uncached_data, + &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - aggsum_value(&arc_sums.arcstat_dnode_size); + wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7488,7 +7209,7 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = - aggsum_value(&arc_sums.arcstat_meta_used); + wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = @@ -7574,7 +7295,6 @@ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); - unsigned long limit; /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && @@ -7591,44 +7311,15 @@ arc_tuning_update(boolean_t verbose) (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); - arc_p = (arc_c >> 1); - if (arc_meta_limit > arc_c_max) - arc_meta_limit = arc_c_max; - if (arc_dnode_size_limit > arc_meta_limit) - arc_dnode_size_limit = arc_meta_limit; + if (arc_dnode_limit > arc_c_max) + arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); - /* Valid range: 16M - */ - if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && - (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && - (zfs_arc_meta_min <= arc_c_max)) { - arc_meta_min = zfs_arc_meta_min; - if (arc_meta_limit < arc_meta_min) - arc_meta_limit = arc_meta_min; - if (arc_dnode_size_limit < arc_meta_min) - arc_dnode_size_limit = arc_meta_min; - } - WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); - - /* Valid range: - */ - limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : - MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; - if ((limit != arc_meta_limit) && - (limit >= arc_meta_min) && - (limit <= arc_c_max)) - arc_meta_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); - - /* Valid range: - */ - limit = zfs_arc_dnode_limit ? 
zfs_arc_dnode_limit : - MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; - if ((limit != arc_dnode_size_limit) && - (limit >= arc_meta_min) && - (limit <= arc_meta_limit)) - arc_dnode_size_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, - verbose); + /* Valid range: 0 - */ + arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : + MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; + WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) @@ -7640,10 +7331,6 @@ arc_tuning_update(boolean_t verbose) arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } - /* Valid range: 1 - N */ - if (zfs_arc_p_min_shift) - arc_p_min_shift = zfs_arc_p_min_shift; - /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; @@ -7732,13 +7419,25 @@ arc_state_init(void) zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); - zfs_refcount_create(&arc_uncached->arcs_size); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); + + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); @@ -7781,7 +7480,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - aggsum_init(&arc_sums.arcstat_dnode_size, 0); + wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7826,7 +7525,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); - aggsum_init(&arc_sums.arcstat_meta_used, 0); + wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); @@ -7865,13 +7564,20 @@ arc_state_fini(void) 
zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - zfs_refcount_destroy(&arc_l2c_only->arcs_size); - zfs_refcount_destroy(&arc_uncached->arcs_size); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); @@ -7886,6 +7592,11 @@ arc_state_fini(void) multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); @@ -7927,7 +7638,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - aggsum_fini(&arc_sums.arcstat_dnode_size); + wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); @@ -7972,7 +7683,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); - aggsum_fini(&arc_sums.arcstat_meta_used); + wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); @@ -8044,18 +7755,16 @@ arc_init(void) #endif arc_c = arc_c_min; - arc_p = (arc_c >> 1); - - /* Set min to 1/2 of arc_c_min */ - arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* - * Set arc_meta_limit to a percent of arc_c_max with a floor of - * arc_meta_min, and a ceiling of arc_c_max. + * 32-bit fixed point fractions of metadata from total ARC size, + * MRU data from all data and MRU metadata from all metadata. */ - percent = MIN(zfs_arc_meta_limit_percent, 100); - arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); + arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ + arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. 
*/ + arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ + percent = MIN(zfs_arc_dnode_limit_percent, 100); - arc_dnode_size_limit = (percent * arc_meta_limit) / 100; + arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); @@ -8832,7 +8541,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8869,7 +8578,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -9763,7 +9472,7 @@ l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); - return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } @@ -10653,7 +10362,7 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ - arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* @@ -11113,40 +10822,18 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_u64, - spl_param_get_u64, ZMOD_RW, "Metadata limit for ARC size in bytes"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, - param_set_arc_int, param_get_uint, ZMOD_RW, - "Percent of ARC size for ARC meta limit"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_u64, - spl_param_get_u64, ZMOD_RW, "Minimum ARC metadata size in bytes"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, - "Meta objects to scan for prune"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, UINT, ZMOD_RW, - "Limit number of restarts in arc_evict_meta"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, UINT, ZMOD_RW, - "Meta reclaim strategy"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, + "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, - "Disable arc_p adapt dampener"); - ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, - param_get_uint, ZMOD_RW, "arc_c shift to calc min/max arc_p"); - ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); diff --git a/tests/zfs-tests/tests/perf/perf.shlib b/tests/zfs-tests/tests/perf/perf.shlib index 27c40bd529..5555e910d7 
100644 --- a/tests/zfs-tests/tests/perf/perf.shlib +++ b/tests/zfs-tests/tests/perf/perf.shlib @@ -485,7 +485,6 @@ function get_system_config printf " \"tunables\": {\n" >>$config for tunable in \ zfs_arc_max \ - zfs_arc_meta_limit \ zfs_arc_sys_free \ zfs_dirty_data_max \ zfs_flags \
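
The per-state eviction targets in the arc_evict() hunks above follow directly from the three 32-bit fixed-point fractions. As a rough illustration of that arithmetic only (not the kernel code), the stand-alone sketch below runs one simulated eviction pass: the state sizes, the evict() helper, and the assumption that everything asked for is evictable are invented for the example; only the shift/multiply math mirrors the hunks in this patch.

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Pretend eviction: everything requested is evictable (a simplification). */
static uint64_t
evict(const char *what, uint64_t *state_size, int64_t e)
{
        uint64_t bytes = (e > 0) ? MIN((uint64_t)e, *state_size) : 0;

        *state_size -= bytes;
        printf("%-14s target %11lld  evicted %11llu\n", what,
            (long long)e, (unsigned long long)bytes);
        return (bytes);
}

int
main(void)
{
        /* Example state sizes (bytes), not real kernel values. */
        uint64_t mrud = 600ULL << 20, mrum = 100ULL << 20;
        uint64_t mfud = 250ULL << 20, mfum = 50ULL << 20;
        uint64_t arc_c = 900ULL << 20;

        /* Initial fractions, as set in arc_init(): 25% metadata, 50% MRU. */
        uint64_t arc_meta = (1ULL << 32) / 4;
        uint64_t arc_pd = (1ULL << 32) / 2;
        uint64_t arc_pm = (1ULL << 32) / 2;

        uint64_t asize = mrud + mrum + mfud + mfum;
        uint64_t m = mrum + mfum;
        uint64_t t = asize;             /* here all of ARC is in these lists */
        uint64_t total_evicted = 0, bytes;
        int64_t wt = t - (asize - arc_c);       /* wanted size after evict */
        int64_t w, e;

        /* MRU metadata: wt scaled by metadata share times metadata-MRU share. */
        w = wt * (arc_meta * arc_pm >> 48) >> 16;
        e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w));
        bytes = evict("MRU metadata", &mrum, e);
        total_evicted += bytes;
        asize -= bytes;

        /* MFU metadata: whatever of the total metadata target remains. */
        w = wt * (arc_meta >> 16) >> 16;
        e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w));
        bytes = evict("MFU metadata", &mfum, e);
        total_evicted += bytes;
        asize -= bytes;

        /* MRU data: the data budget is what is left after metadata. */
        wt -= m - total_evicted;
        w = wt * (arc_pd >> 16) >> 16;
        e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w));
        bytes = evict("MRU data", &mrud, e);
        total_evicted += bytes;
        asize -= bytes;

        /* MFU data: evict whatever still exceeds arc_c. */
        e = asize - arc_c;
        bytes = evict("MFU data", &mfud, e);
        total_evicted += bytes;
        asize -= bytes;

        printf("total evicted %llu MiB, size now %llu MiB\n",
            (unsigned long long)(total_evicted >> 20),
            (unsigned long long)(asize >> 20));
        return (0);
}

With these example numbers the metadata states are already below their 25% share, so the whole 100 MiB overshoot comes out of MRU data; that is how the rewritten balance logic is meant to direct eviction pressure at whichever class and list is over its current share, with the shares themselves adjusted over time by ghost hits and zfs_arc_meta_balance.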