From 19bf54b76414b70866df28c6a66e521c7fef349f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:58:50 -0400 Subject: [PATCH] ZAP: Massively switch to _by_dnode() interfaces Before this change, ZAP called dnode_hold() for almost every block access, which was clearly visible in the profiler under heavy load, such as BRT. This patch makes ZAP always hold the dnode reference between zap_lockdir() and zap_unlockdir(), which avoids most dnode operations between the two calls. It also adds several new _by_dnode() APIs to ZAP and uses them in the BRT code, and adds a dmu_prefetch_by_dnode() variant and uses it in the ZAP code. After this, only one call to dmu_buf_dnode_enter() remains, and it appears to be unneeded, so remove the call and both functions. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15951 --- include/sys/dmu.h | 4 +- include/sys/zap.h | 8 ++ include/sys/zap_impl.h | 1 + module/zfs/brt.c | 72 +++----------- module/zfs/dbuf.c | 15 --- module/zfs/dmu.c | 18 +++- module/zfs/dmu_recv.c | 7 +- module/zfs/zap.c | 43 ++++----- module/zfs/zap_micro.c | 206 +++++++++++++++++++++++++++++------------ 9 files changed, 202 insertions(+), 172 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5bdb7c0293..26b329b53f 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -739,8 +739,6 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. 
*/ void dmu_buf_user_evict_wait(void); @@ -889,6 +887,8 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { diff --git a/include/sys/zap.h b/include/sys/zap.h index 308a7c7284..96ddcc324b 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key, int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. 
If an @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); +int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 74853f5fac..2959aa9b2c 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -145,6 +145,7 @@ typedef struct zap { dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; + dnode_t *zap_dnode; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; boolean_t zap_ismicro; diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 7ddec0b4b9..5e10df9dfe 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -955,52 +955,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) if (mos_entries == 0) return; - BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", - (u_longlong_t)mos_entries, (u_longlong_t)vdevid, - (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } -static int -brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT(bre->bre_refcount > 0); 
- - error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount, tx); - BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - -static int -brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT0(bre->bre_refcount); - - error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); - BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - /* * Return TRUE if we _can_ have BRT entry for this bp. It might be false * positive, but gives us quick answer if we should look into BRT, which @@ -1559,24 +1517,16 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } static void -brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - if (bre->bre_refcount == 0) { - int error; - - error = brt_entry_remove(brt, brtvd, bre, tx); - ASSERT(error == 0 || error == ENOENT); - /* - * If error == ENOENT then zfs_clone_range() was done from a - * removed (but opened) file (open(), unlink()). 
- */ - ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); } } @@ -1585,6 +1535,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; + dnode_t *dn; uint64_t vdevid; void *c; @@ -1608,14 +1559,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brt, brtvd, bre, tx); + brt_sync_entry(dn, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } + dnode_rele(dn, FTAG); + brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 280001bc34..ae5657d762 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4127,21 +4127,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index d82211e6d4..8986f55e79 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - int64_t level2 = level; - uint64_t start, end, start2, end2; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, 
object, pri); @@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, if (dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. @@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); } /* @@ -2563,6 +2571,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 2cf1090973..9f1c25f866 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2353,7 +2353,6 @@ receive_process_write_record(struct receive_writer_arg *rwa, if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; - dnode_t *dn; int flags = DB_RF_CANFAIL; if (rwa->raw) @@ -2385,19 +2384,15 @@ receive_process_write_record(struct receive_writer_arg *rwa, dmu_buf_rele(dbp, FTAG); return (err); } - dn = dmu_buf_dnode_enter(dbp); /* Make sure the on-disk block and recv record sizes match */ - if (drrw->drr_logical_size != - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { + if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err 
= do_corrective_recv(rwa, drrw, rrd, bp); - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } diff --git a/module/zfs/zap.c b/module/zfs/zap.c index dde05d7005..da86defb44 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, 
&db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. - */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 
l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); @@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } @@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 085d9cd8b4..d806988af9 100644 --- 
a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -415,7 +415,7 @@ mze_destroy(zap_t *zap) } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -518,7 +518,7 @@ handle_winner: * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. 
@@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } @@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } @@ -645,6 +641,7 @@ void zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } @@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int 
normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, 
uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1396,10 +1424,30 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) { zap_t *zap; @@ -1407,16 +1455,25 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* 
zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() 
calls zap_unlockdir() */ return (err); } @@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join);