Optimize microzaps

The microzap on-disk format does not include a hash tree, expecting one
to be built in RAM during mzap_open().  The built tree is linked to the
DMU user buffer and freed when the original DMU buffer is dropped from
the cache.  I've found that workloads accessing many large directories
and having active eviction from the DMU cache spend a significant amount
of time building and then destroying the trees.  I've also found that
for each 64-byte mzap element an additional 64-byte tree element is
allocated, which is a waste of memory and CPU caches.

Improve memory efficiency of the hash tree by switching from an AVL tree
to a B-tree.  This saves 24 bytes per element on pointers alone.
Save 32 bits on mze_hash by storing only the upper 32 bits, since the
lower 32 bits are always zero for microzaps.  Save 16 bits on
mze_chunkid, since a microzap can never have so many elements.
Likewise, with only 16 bits of chunk ID there can be no more than 16
bits of collision differentiators.  As a result, struct mzap_ent shrinks
from 48 (rounded to 64) bytes to 8 bytes.

Tune B-trees for small data.  Reduce BTREE_CORE_ELEMS from 128 to 126
so that struct zfs_btree_core, in the case of 8-byte elements, packs
into 2KB instead of 4KB.  Aside from microzaps, this should also help
32-bit range trees.  Allow a custom B-tree leaf size to reduce
memmove() time.

Split zap_name_alloc() into zap_name_alloc() and zap_name_init_str().
This avoids wasting time allocating and freeing memory when processing
multiple names in a loop during mzap_open().

Together, on a pool with 10K directories of 1800 files each and the DMU
cache limited to 128MB, this reduces the time of `find . -name zzz` by
41%, from 7.63s to 4.47s, and saves an additional ~30% of CPU time on
DMU cache reclamation.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #14039
This commit is contained in:
Alexander Motin 2022-10-20 14:57:15 -04:00 committed by GitHub
parent 9650b35e95
commit 9dcdee7889
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 188 additions and 133 deletions

View File

@ -65,7 +65,7 @@ extern "C" {
* them, and increased memory overhead. Increasing these values results in * them, and increased memory overhead. Increasing these values results in
* higher variance in operation time, and reduces memory overhead. * higher variance in operation time, and reduces memory overhead.
*/ */
#define BTREE_CORE_ELEMS 128 #define BTREE_CORE_ELEMS 126
#define BTREE_LEAF_SIZE 4096 #define BTREE_LEAF_SIZE 4096
extern kmem_cache_t *zfs_btree_leaf_cache; extern kmem_cache_t *zfs_btree_leaf_cache;
@ -95,9 +95,6 @@ typedef struct zfs_btree_leaf {
uint8_t btl_elems[]; uint8_t btl_elems[];
} zfs_btree_leaf_t; } zfs_btree_leaf_t;
#define BTREE_LEAF_ESIZE (BTREE_LEAF_SIZE - \
offsetof(zfs_btree_leaf_t, btl_elems))
typedef struct zfs_btree_index { typedef struct zfs_btree_index {
zfs_btree_hdr_t *bti_node; zfs_btree_hdr_t *bti_node;
uint32_t bti_offset; uint32_t bti_offset;
@ -109,14 +106,15 @@ typedef struct zfs_btree_index {
} zfs_btree_index_t; } zfs_btree_index_t;
typedef struct btree { typedef struct btree {
zfs_btree_hdr_t *bt_root; int (*bt_compar) (const void *, const void *);
int64_t bt_height;
size_t bt_elem_size; size_t bt_elem_size;
size_t bt_leaf_size;
uint32_t bt_leaf_cap; uint32_t bt_leaf_cap;
int32_t bt_height;
uint64_t bt_num_elems; uint64_t bt_num_elems;
uint64_t bt_num_nodes; uint64_t bt_num_nodes;
zfs_btree_hdr_t *bt_root;
zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
int (*bt_compar) (const void *, const void *);
} zfs_btree_t; } zfs_btree_t;
/* /*
@ -132,9 +130,12 @@ void zfs_btree_fini(void);
* compar - function to compare two nodes, it must return exactly: -1, 0, or +1 * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
* -1 for <, 0 for ==, and +1 for > * -1 for <, 0 for ==, and +1 for >
* size - the value of sizeof(struct my_type) * size - the value of sizeof(struct my_type)
* lsize - custom leaf size
*/ */
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
size_t); size_t);
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
size_t, size_t);
/* /*
* Find a node with a matching value in the tree. Returns the matching node * Find a node with a matching value in the tree. Returns the matching node

View File

@ -66,10 +66,9 @@ typedef struct mzap_phys {
} mzap_phys_t; } mzap_phys_t;
typedef struct mzap_ent { typedef struct mzap_ent {
avl_node_t mze_node; uint32_t mze_hash;
int mze_chunkid; uint16_t mze_cd; /* copy from mze_phys->mze_cd */
uint64_t mze_hash; uint16_t mze_chunkid;
uint32_t mze_cd; /* copy from mze_phys->mze_cd */
} mzap_ent_t; } mzap_ent_t;
#define MZE_PHYS(zap, mze) \ #define MZE_PHYS(zap, mze) \
@ -164,7 +163,7 @@ typedef struct zap {
int16_t zap_num_entries; int16_t zap_num_entries;
int16_t zap_num_chunks; int16_t zap_num_chunks;
int16_t zap_alloc_next; int16_t zap_alloc_next;
avl_tree_t zap_avl; zfs_btree_t zap_tree;
} zap_micro; } zap_micro;
} zap_u; } zap_u;
} zap_t; } zap_t;
@ -203,7 +202,7 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
zap_t **zapp); zap_t **zapp);
void zap_unlockdir(zap_t *zap, const void *tag); void zap_unlockdir(zap_t *zap, const void *tag);
void zap_evict_sync(void *dbu); void zap_evict_sync(void *dbu);
zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn); void zap_name_free(zap_name_t *zn);
int zap_hashbits(zap_t *zap); int zap_hashbits(zap_t *zap);
uint32_t zap_maxcd(zap_t *zap); uint32_t zap_maxcd(zap_t *zap);

View File

@ -102,7 +102,7 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
(void) memset(leaf->btl_elems + (void) memset(leaf->btl_elems +
(hdr->bth_first + hdr->bth_count) * size, 0x0f, (hdr->bth_first + hdr->bth_count) * size, 0x0f,
BTREE_LEAF_ESIZE - tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
(hdr->bth_first + hdr->bth_count) * size); (hdr->bth_first + hdr->bth_count) * size);
} }
#endif #endif
@ -173,16 +173,44 @@ zfs_btree_fini(void)
kmem_cache_destroy(zfs_btree_leaf_cache); kmem_cache_destroy(zfs_btree_leaf_cache);
} }
/*
 * Allocate memory for one leaf node of the given tree.
 *
 * Trees whose bt_leaf_size equals the default BTREE_LEAF_SIZE take their
 * leaves from zfs_btree_leaf_cache; any other leaf size is served by
 * kmem_alloc() with exactly bt_leaf_size bytes.  Both paths allocate with
 * KM_SLEEP.  The returned memory must be released with
 * zfs_btree_leaf_free() on the same tree so the matching free path is used.
 */
static void *
zfs_btree_leaf_alloc(zfs_btree_t *tree)
{
	/* Custom leaf size: allocate exactly that many bytes. */
	if (tree->bt_leaf_size != BTREE_LEAF_SIZE)
		return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));

	/* Default leaf size: use the shared leaf cache. */
	return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
}
/*
 * Release a leaf node that was obtained from zfs_btree_leaf_alloc() for
 * the given tree.
 *
 * The free path mirrors the allocation path: leaves of the default
 * BTREE_LEAF_SIZE go back to zfs_btree_leaf_cache, while custom-sized
 * leaves are returned with kmem_free() using the tree's bt_leaf_size.
 *
 * Note: this function returns void, so the underlying free routines are
 * invoked as plain statements rather than through "return (expr)", which
 * ISO C does not permit in a void function.
 */
static void
zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
{
	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
		kmem_cache_free(zfs_btree_leaf_cache, ptr);
	else
		kmem_free(ptr, tree->bt_leaf_size);
}
void void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
size_t size) size_t size)
{ {
ASSERT3U(size, <=, BTREE_LEAF_ESIZE / 2); zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
}
void
zfs_btree_create_custom(zfs_btree_t *tree,
int (*compar) (const void *, const void *),
size_t size, size_t lsize)
{
size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
ASSERT3U(size, <=, esize / 2);
memset(tree, 0, sizeof (*tree)); memset(tree, 0, sizeof (*tree));
tree->bt_compar = compar; tree->bt_compar = compar;
tree->bt_elem_size = size; tree->bt_elem_size = size;
tree->bt_leaf_cap = P2ALIGN(BTREE_LEAF_ESIZE / size, 2); tree->bt_leaf_size = lsize;
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
tree->bt_height = -1; tree->bt_height = -1;
tree->bt_bulk = NULL; tree->bt_bulk = NULL;
} }
@ -290,7 +318,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
zfs_btree_core_t *node = NULL; zfs_btree_core_t *node = NULL;
uint32_t child = 0; uint32_t child = 0;
uint64_t depth = 0; uint32_t depth = 0;
/* /*
* Iterate down the tree, finding which child the value should be in * Iterate down the tree, finding which child the value should be in
@ -811,8 +839,7 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
move_count++; move_count++;
} }
tree->bt_num_nodes++; tree->bt_num_nodes++;
zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
KM_SLEEP);
zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
new_hdr->bth_parent = leaf->btl_hdr.bth_parent; new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
@ -1078,8 +1105,7 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
ASSERT0(where->bti_offset); ASSERT0(where->bti_offset);
tree->bt_num_nodes++; tree->bt_num_nodes++;
zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
KM_SLEEP);
tree->bt_root = &leaf->btl_hdr; tree->bt_root = &leaf->btl_hdr;
tree->bt_height++; tree->bt_height++;
@ -1378,7 +1404,7 @@ zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
{ {
tree->bt_num_nodes--; tree->bt_num_nodes--;
if (!zfs_btree_is_core(node)) { if (!zfs_btree_is_core(node)) {
kmem_cache_free(zfs_btree_leaf_cache, node); zfs_btree_leaf_free(tree, node);
} else { } else {
kmem_free(node, sizeof (zfs_btree_core_t) + kmem_free(node, sizeof (zfs_btree_core_t) +
BTREE_CORE_ELEMS * tree->bt_elem_size); BTREE_CORE_ELEMS * tree->bt_elem_size);
@ -1991,7 +2017,7 @@ zfs_btree_verify_counts(zfs_btree_t *tree)
*/ */
static uint64_t static uint64_t
zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
int64_t height) int32_t height)
{ {
if (!zfs_btree_is_core(hdr)) { if (!zfs_btree_is_core(hdr)) {
VERIFY0(height); VERIFY0(height);
@ -2117,8 +2143,10 @@ zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
for (size_t i = 0; i < hdr->bth_first * size; i++) for (size_t i = 0; i < hdr->bth_first * size; i++)
VERIFY3U(leaf->btl_elems[i], ==, 0x0f); VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
size_t esize = tree->bt_leaf_size -
offsetof(zfs_btree_leaf_t, btl_elems);
for (size_t i = (hdr->bth_first + hdr->bth_count) * size; for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
i < BTREE_LEAF_ESIZE; i++) i < esize; i++)
VERIFY3U(leaf->btl_elems[i], ==, 0x0f); VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
} else { } else {
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;

View File

@ -646,7 +646,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
* form of the name. But all callers have one of these on hand anyway, * form of the name. But all callers have one of these on hand anyway,
* so might as well take advantage. A cleaner but slower interface * so might as well take advantage. A cleaner but slower interface
* would accept neither argument, and compute the normalized name as * would accept neither argument, and compute the normalized name as
* needed (using zap_name_alloc(zap_entry_read_name(zeh))). * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))).
*/ */
boolean_t boolean_t
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
@ -667,7 +667,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
continue; continue;
if (zn == NULL) { if (zn == NULL) {
zn = zap_name_alloc(zap, name, MT_NORMALIZE); zn = zap_name_alloc_str(zap, name, MT_NORMALIZE);
allocdzn = B_TRUE; allocdzn = B_TRUE;
} }
if (zap_leaf_array_match(zeh->zeh_leaf, zn, if (zap_leaf_array_match(zeh->zeh_leaf, zn,

View File

@ -33,7 +33,7 @@
#include <sys/zap.h> #include <sys/zap.h>
#include <sys/zap_impl.h> #include <sys/zap_impl.h>
#include <sys/zap_leaf.h> #include <sys/zap_leaf.h>
#include <sys/avl.h> #include <sys/btree.h>
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/dmu_objset.h> #include <sys/dmu_objset.h>
@ -92,7 +92,7 @@ zap_hash(zap_name_t *zn)
wp++, i++) { wp++, i++) {
uint64_t word = *wp; uint64_t word = *wp;
for (int j = 0; j < zn->zn_key_intlen; j++) { for (int j = 0; j < 8; j++) {
h = (h >> 8) ^ h = (h >> 8) ^
zfs_crc64_table[(h ^ word) & 0xFF]; zfs_crc64_table[(h ^ word) & 0xFF];
word >>= NBBY; word >>= NBBY;
@ -162,18 +162,25 @@ zap_match(zap_name_t *zn, const char *matchname)
} }
} }
/*
 * Allocate a zap_name_t bound to the given zap.
 *
 * Only zn_zap is set here; the key-related fields are left for a later
 * initialization step (e.g. zap_name_init_str()).  The allocation uses
 * KM_SLEEP, and the result is released with zap_name_free().
 */
static zap_name_t *
zap_name_alloc(zap_t *zap)
{
	zap_name_t *name = kmem_alloc(sizeof (*name), KM_SLEEP);

	name->zn_zap = zap;
	return (name);
}
void void
zap_name_free(zap_name_t *zn) zap_name_free(zap_name_t *zn)
{ {
kmem_free(zn, sizeof (zap_name_t)); kmem_free(zn, sizeof (zap_name_t));
} }
zap_name_t * static int
zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{ {
zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zap_t *zap = zn->zn_zap;
zn->zn_zap = zap;
zn->zn_key_intlen = sizeof (*key); zn->zn_key_intlen = sizeof (*key);
zn->zn_key_orig = key; zn->zn_key_orig = key;
zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
@ -194,17 +201,13 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
* what the hash is computed from. * what the hash is computed from.
*/ */
if (zap_normalize(zap, key, zn->zn_normbuf, if (zap_normalize(zap, key, zn->zn_normbuf,
zap->zap_normflags) != 0) { zap->zap_normflags) != 0)
zap_name_free(zn); return (SET_ERROR(ENOTSUP));
return (NULL);
}
zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm = zn->zn_normbuf;
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
} else { } else {
if (mt != 0) { if (mt != 0)
zap_name_free(zn); return (SET_ERROR(ENOTSUP));
return (NULL);
}
zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm = zn->zn_key_orig;
zn->zn_key_norm_numints = zn->zn_key_orig_numints; zn->zn_key_norm_numints = zn->zn_key_orig_numints;
} }
@ -217,13 +220,22 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
* what the matching is based on. (Not the hash!) * what the matching is based on. (Not the hash!)
*/ */
if (zap_normalize(zap, key, zn->zn_normbuf, if (zap_normalize(zap, key, zn->zn_normbuf,
zn->zn_normflags) != 0) { zn->zn_normflags) != 0)
zap_name_free(zn); return (SET_ERROR(ENOTSUP));
return (NULL);
}
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
} }
return (0);
}
zap_name_t *
zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
{
zap_name_t *zn = zap_name_alloc(zap);
if (zap_name_init_str(zn, key, mt) != 0) {
zap_name_free(zn);
return (NULL);
}
return (zn); return (zn);
} }
@ -277,45 +289,46 @@ mze_compare(const void *arg1, const void *arg2)
const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze1 = arg1;
const mzap_ent_t *mze2 = arg2; const mzap_ent_t *mze2 = arg2;
int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
if (likely(cmp)) (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
return (cmp);
return (TREE_CMP(mze1->mze_cd, mze2->mze_cd));
} }
static void static void
mze_insert(zap_t *zap, int chunkid, uint64_t hash) mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{ {
mzap_ent_t mze;
ASSERT(zap->zap_ismicro); ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze.mze_chunkid = chunkid;
mze->mze_chunkid = chunkid; ASSERT0(hash & 0xffffffff);
mze->mze_hash = hash; mze.mze_hash = hash >> 32;
mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
avl_add(&zap->zap_m.zap_avl, mze); ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
zfs_btree_add(&zap->zap_m.zap_tree, &mze);
} }
static mzap_ent_t * static mzap_ent_t *
mze_find(zap_name_t *zn) mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{ {
mzap_ent_t mze_tofind; mzap_ent_t mze_tofind;
mzap_ent_t *mze; mzap_ent_t *mze;
avl_index_t idx; zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
ASSERT(zn->zn_zap->zap_ismicro); ASSERT(zn->zn_zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
mze_tofind.mze_hash = zn->zn_hash; ASSERT0(zn->zn_hash & 0xffffffff);
mze_tofind.mze_hash = zn->zn_hash >> 32;
mze_tofind.mze_cd = 0; mze_tofind.mze_cd = 0;
mze = avl_find(avl, &mze_tofind, &idx); mze = zfs_btree_find(tree, &mze_tofind, idx);
if (mze == NULL) if (mze == NULL)
mze = avl_nearest(avl, idx, AVL_AFTER); mze = zfs_btree_next(tree, idx, idx);
for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { for (; mze && mze->mze_hash == mze_tofind.mze_hash;
mze = zfs_btree_next(tree, idx, idx)) {
ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
return (mze); return (mze);
@ -328,18 +341,21 @@ static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash) mze_find_unused_cd(zap_t *zap, uint64_t hash)
{ {
mzap_ent_t mze_tofind; mzap_ent_t mze_tofind;
avl_index_t idx; zfs_btree_index_t idx;
avl_tree_t *avl = &zap->zap_m.zap_avl; zfs_btree_t *tree = &zap->zap_m.zap_tree;
ASSERT(zap->zap_ismicro); ASSERT(zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT0(hash & 0xffffffff);
hash >>= 32;
mze_tofind.mze_hash = hash; mze_tofind.mze_hash = hash;
mze_tofind.mze_cd = 0; mze_tofind.mze_cd = 0;
uint32_t cd = 0; uint32_t cd = 0;
for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { mze && mze->mze_hash == hash;
mze = zfs_btree_next(tree, &idx, &idx)) {
if (mze->mze_cd != cd) if (mze->mze_cd != cd)
break; break;
cd++; cd++;
@ -364,16 +380,18 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{ {
zap_t *zap = zn->zn_zap; zap_t *zap = zn->zn_zap;
mzap_ent_t mze_tofind; mzap_ent_t mze_tofind;
mzap_ent_t *mze; zfs_btree_index_t idx;
avl_index_t idx; zfs_btree_t *tree = &zap->zap_m.zap_tree;
avl_tree_t *avl = &zap->zap_m.zap_avl;
uint32_t mzap_ents = 0; uint32_t mzap_ents = 0;
ASSERT0(hash & 0xffffffff);
hash >>= 32;
mze_tofind.mze_hash = hash; mze_tofind.mze_hash = hash;
mze_tofind.mze_cd = 0; mze_tofind.mze_cd = 0;
for (mze = avl_find(avl, &mze_tofind, &idx); for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { mze && mze->mze_hash == hash;
mze = zfs_btree_next(tree, &idx, &idx)) {
mzap_ents++; mzap_ents++;
} }
@ -383,25 +401,11 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
} }
static void
mze_remove(zap_t *zap, mzap_ent_t *mze)
{
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
avl_remove(&zap->zap_m.zap_avl, mze);
kmem_free(mze, sizeof (mzap_ent_t));
}
static void static void
mze_destroy(zap_t *zap) mze_destroy(zap_t *zap)
{ {
mzap_ent_t *mze; zfs_btree_clear(&zap->zap_m.zap_tree);
void *avlcookie = NULL; zfs_btree_destroy(&zap->zap_m.zap_tree);
while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
kmem_free(mze, sizeof (mzap_ent_t));
avl_destroy(&zap->zap_m.zap_avl);
} }
static zap_t * static zap_t *
@ -448,21 +452,26 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_salt = zap_m_phys(zap)->mz_salt;
zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
avl_create(&zap->zap_m.zap_avl, mze_compare,
sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { /*
* Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
* overhead on massive inserts below. It still allows to store
* 62 entries before we have to add 2KB B-tree core node.
*/
zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
sizeof (mzap_ent_t), 512);
zap_name_t *zn = zap_name_alloc(zap);
for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = mzap_ent_phys_t *mze =
&zap_m_phys(zap)->mz_chunk[i]; &zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0]) { if (mze->mze_name[0]) {
zap_name_t *zn;
zap->zap_m.zap_num_entries++; zap->zap_m.zap_num_entries++;
zn = zap_name_alloc(zap, mze->mze_name, 0); zap_name_init_str(zn, mze->mze_name, 0);
mze_insert(zap, i, zn->zn_hash); mze_insert(zap, i, zn->zn_hash);
}
}
zap_name_free(zn); zap_name_free(zn);
}
}
} else { } else {
zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_salt = zap_f_phys(zap)->zap_salt;
zap->zap_normflags = zap_f_phys(zap)->zap_normflags; zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
@ -657,24 +666,25 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
dprintf("upgrading obj=%llu with %u chunks\n", dprintf("upgrading obj=%llu with %u chunks\n",
(u_longlong_t)zap->zap_object, nchunks); (u_longlong_t)zap->zap_object, nchunks);
/* XXX destroy the avl later, so we can use the stored hash value */ /* XXX destroy the tree later, so we can use the stored hash value */
mze_destroy(zap); mze_destroy(zap);
fzap_upgrade(zap, tx, flags); fzap_upgrade(zap, tx, flags);
zap_name_t *zn = zap_name_alloc(zap);
for (int i = 0; i < nchunks; i++) { for (int i = 0; i < nchunks; i++) {
mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
if (mze->mze_name[0] == 0) if (mze->mze_name[0] == 0)
continue; continue;
dprintf("adding %s=%llu\n", dprintf("adding %s=%llu\n",
mze->mze_name, (u_longlong_t)mze->mze_value); mze->mze_name, (u_longlong_t)mze->mze_value);
zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); zap_name_init_str(zn, mze->mze_name, 0);
/* If we fail here, we would end up losing entries */ /* If we fail here, we would end up losing entries */
VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
tag, tx)); tag, tx));
zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap = zn->zn_zap; /* fzap_add_cd() may change zap */
zap_name_free(zn);
} }
zap_name_free(zn);
vmem_free(mzp, sz); vmem_free(mzp, sz);
*zapp = zap; *zapp = zap;
return (0); return (0);
@ -916,22 +926,23 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
* See also the comment above zap_entry_normalization_conflict(). * See also the comment above zap_entry_normalization_conflict().
*/ */
static boolean_t static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
zfs_btree_index_t *idx)
{ {
int direction = AVL_BEFORE;
boolean_t allocdzn = B_FALSE; boolean_t allocdzn = B_FALSE;
mzap_ent_t *other;
zfs_btree_index_t oidx;
if (zap->zap_normflags == 0) if (zap->zap_normflags == 0)
return (B_FALSE); return (B_FALSE);
again: for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
other && other->mze_hash == mze->mze_hash; other && other->mze_hash == mze->mze_hash;
other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
if (zn == NULL) { if (zn == NULL) {
zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, zn = zap_name_alloc_str(zap,
MT_NORMALIZE); MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
allocdzn = B_TRUE; allocdzn = B_TRUE;
} }
if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
@ -941,9 +952,20 @@ again:
} }
} }
if (direction == AVL_BEFORE) { for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
direction = AVL_AFTER; other && other->mze_hash == mze->mze_hash;
goto again; other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
if (zn == NULL) {
zn = zap_name_alloc_str(zap,
MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
allocdzn = B_TRUE;
}
if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
if (allocdzn)
zap_name_free(zn);
return (B_TRUE);
}
} }
if (allocdzn) if (allocdzn)
@ -971,7 +993,7 @@ zap_lookup_impl(zap_t *zap, const char *name,
{ {
int err = 0; int err = 0;
zap_name_t *zn = zap_name_alloc(zap, name, mt); zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
if (zn == NULL) if (zn == NULL)
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
@ -979,7 +1001,8 @@ zap_lookup_impl(zap_t *zap, const char *name,
err = fzap_lookup(zn, integer_size, num_integers, buf, err = fzap_lookup(zn, integer_size, num_integers, buf,
realname, rn_len, ncp); realname, rn_len, ncp);
} else { } else {
mzap_ent_t *mze = mze_find(zn); zfs_btree_index_t idx;
mzap_ent_t *mze = mze_find(zn, &idx);
if (mze == NULL) { if (mze == NULL) {
err = SET_ERROR(ENOENT); err = SET_ERROR(ENOENT);
} else { } else {
@ -996,7 +1019,7 @@ zap_lookup_impl(zap_t *zap, const char *name,
rn_len); rn_len);
if (ncp) { if (ncp) {
*ncp = mzap_normalization_conflict(zap, *ncp = mzap_normalization_conflict(zap,
zn, mze); zn, mze, &idx);
} }
} }
} }
@ -1033,7 +1056,7 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err) if (err)
return (err); return (err);
zn = zap_name_alloc(zap, name, 0); zn = zap_name_alloc_str(zap, name, 0);
if (zn == NULL) { if (zn == NULL) {
zap_unlockdir(zap, FTAG); zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
@ -1136,7 +1159,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0) if (err != 0)
return (err); return (err);
zap_name_t *zn = zap_name_alloc(zap, name, 0); zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
if (zn == NULL) { if (zn == NULL) {
zap_unlockdir(zap, FTAG); zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
@ -1144,7 +1167,8 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
if (!zap->zap_ismicro) { if (!zap->zap_ismicro) {
err = fzap_length(zn, integer_size, num_integers); err = fzap_length(zn, integer_size, num_integers);
} else { } else {
mzap_ent_t *mze = mze_find(zn); zfs_btree_index_t idx;
mzap_ent_t *mze = mze_find(zn, &idx);
if (mze == NULL) { if (mze == NULL) {
err = SET_ERROR(ENOENT); err = SET_ERROR(ENOENT);
} else { } else {
@ -1184,7 +1208,7 @@ static void
mzap_addent(zap_name_t *zn, uint64_t value) mzap_addent(zap_name_t *zn, uint64_t value)
{ {
zap_t *zap = zn->zn_zap; zap_t *zap = zn->zn_zap;
int start = zap->zap_m.zap_alloc_next; uint16_t start = zap->zap_m.zap_alloc_next;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@ -1200,7 +1224,7 @@ mzap_addent(zap_name_t *zn, uint64_t value)
ASSERT(cd < zap_maxcd(zap)); ASSERT(cd < zap_maxcd(zap));
again: again:
for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0] == 0) { if (mze->mze_name[0] == 0) {
mze->mze_value = value; mze->mze_value = value;
@ -1231,7 +1255,7 @@ zap_add_impl(zap_t *zap, const char *key,
const uint64_t *intval = val; const uint64_t *intval = val;
int err = 0; int err = 0;
zap_name_t *zn = zap_name_alloc(zap, key, 0); zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
if (zn == NULL) { if (zn == NULL) {
zap_unlockdir(zap, tag); zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
@ -1249,7 +1273,8 @@ zap_add_impl(zap_t *zap, const char *key,
} }
zap = zn->zn_zap; /* fzap_add() may change zap */ zap = zn->zn_zap; /* fzap_add() may change zap */
} else { } else {
if (mze_find(zn) != NULL) { zfs_btree_index_t idx;
if (mze_find(zn, &idx) != NULL) {
err = SET_ERROR(EEXIST); err = SET_ERROR(EEXIST);
} else { } else {
mzap_addent(zn, *intval); mzap_addent(zn, *intval);
@ -1329,7 +1354,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0) if (err != 0)
return (err); return (err);
zap_name_t *zn = zap_name_alloc(zap, name, 0); zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
if (zn == NULL) { if (zn == NULL) {
zap_unlockdir(zap, FTAG); zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
@ -1350,7 +1375,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
} }
zap = zn->zn_zap; /* fzap_update() may change zap */ zap = zn->zn_zap; /* fzap_update() may change zap */
} else { } else {
mzap_ent_t *mze = mze_find(zn); zfs_btree_index_t idx;
mzap_ent_t *mze = mze_find(zn, &idx);
if (mze != NULL) { if (mze != NULL) {
MZE_PHYS(zap, mze)->mze_value = *intval; MZE_PHYS(zap, mze)->mze_value = *intval;
} else { } else {
@ -1400,20 +1426,20 @@ zap_remove_impl(zap_t *zap, const char *name,
{ {
int err = 0; int err = 0;
zap_name_t *zn = zap_name_alloc(zap, name, mt); zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
if (zn == NULL) if (zn == NULL)
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
if (!zap->zap_ismicro) { if (!zap->zap_ismicro) {
err = fzap_remove(zn, tx); err = fzap_remove(zn, tx);
} else { } else {
mzap_ent_t *mze = mze_find(zn); zfs_btree_index_t idx;
mzap_ent_t *mze = mze_find(zn, &idx);
if (mze == NULL) { if (mze == NULL) {
err = SET_ERROR(ENOENT); err = SET_ERROR(ENOENT);
} else { } else {
zap->zap_m.zap_num_entries--; zap->zap_m.zap_num_entries--;
memset(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], 0, memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
sizeof (mzap_ent_phys_t)); zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
mze_remove(zap, mze);
} }
} }
zap_name_free(zn); zap_name_free(zn);
@ -1584,29 +1610,30 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
if (!zc->zc_zap->zap_ismicro) { if (!zc->zc_zap->zap_ismicro) {
err = fzap_cursor_retrieve(zc->zc_zap, zc, za); err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
} else { } else {
avl_index_t idx; zfs_btree_index_t idx;
mzap_ent_t mze_tofind; mzap_ent_t mze_tofind;
mze_tofind.mze_hash = zc->zc_hash; mze_tofind.mze_hash = zc->zc_hash >> 32;
mze_tofind.mze_cd = zc->zc_cd; mze_tofind.mze_cd = zc->zc_cd;
mzap_ent_t *mze = mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); &mze_tofind, &idx);
if (mze == NULL) { if (mze == NULL) {
mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
idx, AVL_AFTER); &idx, &idx);
} }
if (mze) { if (mze) {
mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
za->za_normalization_conflict = za->za_normalization_conflict =
mzap_normalization_conflict(zc->zc_zap, NULL, mze); mzap_normalization_conflict(zc->zc_zap, NULL,
mze, &idx);
za->za_integer_length = 8; za->za_integer_length = 8;
za->za_num_integers = 1; za->za_num_integers = 1;
za->za_first_integer = mzep->mze_value; za->za_first_integer = mzep->mze_value;
(void) strlcpy(za->za_name, mzep->mze_name, (void) strlcpy(za->za_name, mzep->mze_name,
sizeof (za->za_name)); sizeof (za->za_name));
zc->zc_hash = mze->mze_hash; zc->zc_hash = (uint64_t)mze->mze_hash << 32;
zc->zc_cd = mze->mze_cd; zc->zc_cd = mze->mze_cd;
err = 0; err = 0;
} else { } else {