diff --git a/cmd/dbufstat.in b/cmd/dbufstat.in index 08c22864e5..1252496577 100755 --- a/cmd/dbufstat.in +++ b/cmd/dbufstat.in @@ -37,7 +37,7 @@ import re bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", - "meta", "state", "dbholds", "dbc", "list", "atype", "flags", + "usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] @@ -47,17 +47,17 @@ dhdr = ["pool", "objset", "object", "dtype", "cached"] dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", "indirect", "bonus", "spill"] -dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", - "dbc", "list", "atype", "flags", "count", "asize", "access", - "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", - "l2_comp", "aholds"] +dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state", + "dbholds", "dbc", "list", "atype", "flags", "count", "asize", + "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds"] thdr = ["pool", "objset", "dtype", "cached"] txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", "bonus", "spill"] -tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", - "dbc", "dbholds", "list", "atype", "flags", "count", "asize", - "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", +tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta", + "state", "dbc", "dbholds", "list", "atype", "flags", "count", + "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", 
"dsize"] @@ -70,6 +70,7 @@ cols = { "blkid": [8, -1, "block number of buffer"], "offset": [12, 1024, "offset in object of buffer"], "dbsize": [7, 1024, "size of buffer"], + "usize": [7, 1024, "size of attached user data"], "meta": [4, -1, "is this buffer metadata?"], "state": [5, -1, "state of buffer (read, cached, etc)"], "dbholds": [7, 1000, "number of holds on buffer"], @@ -399,6 +400,7 @@ def update_dict(d, k, line, labels): key = line[labels[k]] dbsize = int(line[labels['dbsize']]) + usize = int(line[labels['usize']]) blkid = int(line[labels['blkid']]) level = int(line[labels['level']]) @@ -416,7 +418,7 @@ def update_dict(d, k, line, labels): d[pool][objset][key]['indirect'] = 0 d[pool][objset][key]['spill'] = 0 - d[pool][objset][key]['cached'] += dbsize + d[pool][objset][key]['cached'] += dbsize + usize if blkid == -1: d[pool][objset][key]['bonus'] += dbsize diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1cc8b8971a..bddf395df7 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -652,6 +652,9 @@ typedef struct dmu_buf_user { */ taskq_ent_t dbu_tqent; + /* Size of user data, for inclusion in dbuf_cache accounting. */ + uint64_t dbu_size; + /* * This instance's eviction function pointers. * @@ -733,6 +736,16 @@ void *dmu_buf_replace_user(dmu_buf_t *db, */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); +/* + * User data size accounting. This can be used to artificially inflate the size + * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough + * to satisfy memory reclaim requests. It's not used for anything else, and + * defaults to 0. + */ +uint64_t dmu_buf_user_size(dmu_buf_t *db); +void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd); +void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); + /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. 
*/ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 0a179fffb1..5cd97b9fac 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -569,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif + if (db->db_caching_status != DB_NO_CACHE) { + /* + * This is a cached dbuf, so the size of the user data is + * included in its cached amount. We adjust it here because the + * user data has already been detached from the dbuf, and the + * sync functions are not supposed to touch it (the dbuf might + * not exist anymore by the time the sync functions run). + */ + uint64_t size = dbu->dbu_size; + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, size, db); + if (db->db_caching_status == DB_DBUF_CACHE) + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + } + /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for @@ -770,12 +785,12 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + &dbuf_caches[DB_DBUF_CACHE].size, size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); @@ -3002,6 +3017,8 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + ASSERT0(dmu_buf_user_size(&db->db)); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); @@ -3749,17 +3766,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, 
uint64_t blkid, db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); + &dbuf_caches[db->db_caching_status].size, size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); } db->db_caching_status = DB_NO_CACHE; } @@ -3978,7 +3995,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size; + uint64_t db_size = db->db.db_size + + dmu_buf_user_size(&db->db); size = zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); uint8_t db_level = db->db_level; @@ -4074,6 +4092,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user); } +uint64_t +dmu_buf_user_size(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + if (db->db_user == NULL) + return (0); + return (atomic_load_64(&db->db_user->dbu_size)); +} + +void +dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd); + atomic_add_64(&db->db_user->dbu_size, nadd); +} + +void +dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub); + 
atomic_sub_64(&db->db_user->dbu_size, nsub); +} + void dmu_buf_user_evict_wait(void) { diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index e5dc2df307..ccee8997e1 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -46,14 +46,14 @@ static int dbuf_stats_hash_table_headers(char *buf, size_t size) { (void) snprintf(buf, size, - "%-96s | %-119s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | " + "%-105s | %-119s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | " "%-5s %-5s %-9s %-6s %-8s %-12s " "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | " "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n", "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc", - "list", "atype", "flags", "count", "asize", "access", + "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"); @@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) __dmu_object_info_from_dnode(dn, &doi); nwritten = snprintf(buf, size, - "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d " - "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " + "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu " + "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | " "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n", /* dmu_buf_impl_t */ @@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) (longlong_t)db->db_blkid, (u_longlong_t)db->db.db_offset, (u_longlong_t)db->db.db_size, + (u_longlong_t)dmu_buf_user_size(&db->db), !!dbuf_is_metadata(db), db->db_state, 
(ulong_t)zfs_refcount_count(&db->db_holds), diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 79fd02dcb9..029d9df8af 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1237,9 +1237,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) return (B_TRUE); } -static void +static uint_t dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) { + uint_t reclaimed = 0; + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { @@ -1251,8 +1253,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); dnode_destroy(dnh->dnh_dnode); dnh->dnh_dnode = DN_SLOT_FREE; + reclaimed++; } } + + return (reclaimed); } void @@ -1565,6 +1570,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, + sizeof (dnode_t)); } } @@ -1622,8 +1629,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * to be freed. Single slot dnodes can be safely * re-purposed as a performance optimization. */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (slots > 1) { + uint_t reclaimed = + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (reclaimed > 0) + dmu_buf_sub_user_size(&db->db, + reclaimed * sizeof (dnode_t)); + } dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { @@ -1631,6 +1643,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, sizeof (dnode_t)); } mutex_enter(&dn->dn_mtx);