From 60d25f4b938f4a2dec8f9d5e479ff968060ce1aa Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 19 Mar 2009 15:22:48 -0700 Subject: [PATCH] The patch resolves the extra call to dnode_cons() in dnode_create(). The extra call to the constructor was there to reinitialize the non- trivial primatives in the dnode (lists, mutexs, condvars, avl tree, etc). This was safe, although not exactly clean, on Solaris because none of the primitives allocate memory. In the Linux port this is not true. To keep stack usage to a minimum several of the primatives dynamically allocate memory thus initializing them twice results in a memory leak. This patch resolves this problem for Solaris and Linux by ensuring all *_inits are called in the constructor, and all *_destroys are called in the destructor. Additionally we ensure that all dnode objects are properly deconstructed before being freed to the slab, and when the objects are allocated from the slab all required data members are explicity initialized to correct values. --- .topmsg | 15 ++++++- module/zfs/dmu_zfetch.c | 67 ++++++++++++++++------------- module/zfs/dnode.c | 52 +++++++++++++++++++++- module/zfs/include/sys/dmu_zfetch.h | 3 ++ module/zfs/include/sys/dnode.h | 3 ++ 5 files changed, 107 insertions(+), 33 deletions(-) diff --git a/.topmsg b/.topmsg index 83b3e71cc9..e8212e012c 100644 --- a/.topmsg +++ b/.topmsg @@ -1,7 +1,20 @@ From: Brian Behlendorf Subject: [PATCH] fix dnode constructor -Don't call the constructor twice +The patch removes the extra call to dnode_cons() in dnode_create(). The +extra call to the constructor was there to reinitialize the non-trivial +data structures in the dnode (lists, mutexs, condvars, avl tree, etc). +This was safe, although not exactly clean, on Solaris because none of +the primitives allocate memory. In the Linux port this is not true. +To keep stack usage to a minimum several of the primatives dynamically +allocate memory thus initializing them twice results in a memory leak. + +This patch resolves this problem for Solaris and Linux by ensures all +*_init's are called in the constructor, and all *_destroy's are called +in the destructor. Additionally we ensure that all dnode objects are +properly deconstructed before being freed to the slab, and when the +objects are allocated from the slab all required data members are +explicity initialized to correct values. Signed-off-by: Brian Behlendorf diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 4d79fe98e1..5ba366c449 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -203,20 +203,53 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) void dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { - if (zf == NULL) { - return; - } - zf->zf_dnode = dno; zf->zf_stream_cnt = 0; zf->zf_alloc_fail = 0; +} +/* + * Clean-up state associated with a zfetch structure. This frees allocated + * structure members, empties the zf_stream tree, and generally makes things + * nice. This doesn't free the zfetch_t itself, that's left to the caller. + */ +void +dmu_zfetch_rele(zfetch_t *zf) +{ + zstream_t *zs; + zstream_t *zs_next; + + for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zst_lock); + kmem_free(zs, sizeof (zstream_t)); + } +} + +/* + * Construct a zfetch structure. + */ +void +dmu_zfetch_cons(zfetch_t *zf) +{ list_create(&zf->zf_stream, sizeof (zstream_t), offsetof(zstream_t, zst_node)); rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); } +/* + * Destruct a zfetch structure. + */ +void +dmu_zfetch_dest(zfetch_t *zf) +{ + list_destroy(&zf->zf_stream); + rw_destroy(&zf->zf_rwlock); +} + /* * This function computes the actual size, in blocks, that can be prefetched, * and fetches it. @@ -441,32 +474,6 @@ out: return (rc); } -/* - * Clean-up state associated with a zfetch structure. This frees allocated - * structure members, empties the zf_stream tree, and generally makes things - * nice. This doesn't free the zfetch_t itself, that's left to the caller. - */ -void -dmu_zfetch_rele(zfetch_t *zf) -{ - zstream_t *zs; - zstream_t *zs_next; - - ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); - - for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { - zs_next = list_next(&zf->zf_stream, zs); - - list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zst_lock); - kmem_free(zs, sizeof (zstream_t)); - } - list_destroy(&zf->zf_stream); - rw_destroy(&zf->zf_rwlock); - - zf->zf_dnode = NULL; -} - /* * Given a zfetch and zstream structure, insert the zstream structure into the * AVL tree contained within the zfetch structure. Peform the appropriate diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index b00a8c4a71..384a5bea9f 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -72,6 +72,7 @@ dnode_cons(void *arg, void *unused, int kmflag) list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); + dmu_zfetch_cons(&dn->dn_zfetch); return (0); } @@ -96,6 +97,7 @@ dnode_dest(void *arg, void *unused) } list_destroy(&dn->dn_dbufs); + dmu_zfetch_dest(&dn->dn_zfetch); } void @@ -165,6 +167,27 @@ dnode_verify(dnode_t *dn) if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } + +void +dnode_verify_clean(dnode_t *dn) +{ + int i; + + ASSERT(!RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(!MUTEX_HELD(&dn->dn_mtx)); + ASSERT(!MUTEX_HELD(&dn->dn_dbufs_mtx)); + ASSERT(refcount_is_zero(&dn->dn_holds)); + ASSERT(refcount_is_zero(&dn->dn_tx_holds)); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); + ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); + } + + ASSERT(NULL == list_head(&dn->dn_dbufs)); + ASSERT(NULL == list_head(&dn->dn_zfetch.zf_stream)); + ASSERT(!RW_LOCK_HELD(&dn->dn_zfetch.zf_rwlock)); +} #endif void @@ -276,14 +299,32 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object) { dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + int i; + DNODE_VERIFY_CLEAN(dn); dn->dn_objset = os; dn->dn_object = object; dn->dn_dbuf = db; dn->dn_phys = dnp; - if (dnp->dn_datablkszsec) + list_link_init(&dn->dn_link); + for (i = 0; i < TXG_SIZE; i++) { + list_link_init(&dn->dn_dirty_link[i]); + dn->dn_next_nblkptr[i] = 0; + dn->dn_next_nlevels[i] = 0; + dn->dn_next_indblkshift[i] = 0; + dn->dn_next_bonuslen[i] = 0; + dn->dn_next_blksz[i] = 0; + } + + if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } + dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; @@ -293,7 +334,13 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_maxblkid = dnp->dn_maxblkid; - + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirtyctx = DN_UNDIRTIED; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_zio = NULL; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); @@ -335,6 +382,7 @@ dnode_destroy(dnode_t *dn) dbuf_evict(dn->dn_bonus); dn->dn_bonus = NULL; } + DNODE_VERIFY_CLEAN(dn); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); } diff --git a/module/zfs/include/sys/dmu_zfetch.h b/module/zfs/include/sys/dmu_zfetch.h index c94bced933..97b6738522 100644 --- a/module/zfs/include/sys/dmu_zfetch.h +++ b/module/zfs/include/sys/dmu_zfetch.h @@ -65,6 +65,9 @@ typedef struct zfetch { void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_rele(zfetch_t *); +void dmu_zfetch_cons(zfetch_t *); +void dmu_zfetch_dest(zfetch_t *); + void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); diff --git a/module/zfs/include/sys/dnode.h b/module/zfs/include/sys/dnode.h index be9e569083..f802a480f8 100644 --- a/module/zfs/include/sys/dnode.h +++ b/module/zfs/include/sys/dnode.h @@ -222,6 +222,7 @@ void dnode_free(dnode_t *dn, dmu_tx_t *tx); void dnode_byteswap(dnode_phys_t *dnp); void dnode_buf_byteswap(void *buf, size_t size); void dnode_verify(dnode_t *dn); +void dnode_verify_clean(dnode_t *dn); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); uint64_t dnode_current_max_length(dnode_t *dn); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); @@ -259,12 +260,14 @@ void dnode_evict_dbufs(dnode_t *dn); _NOTE(CONSTCOND) } while (0) #define DNODE_VERIFY(dn) dnode_verify(dn) +#define DNODE_VERIFY_CLEAN(dn) dnode_verify_clean(dn) #define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) #else #define dprintf_dnode(db, fmt, ...) #define DNODE_VERIFY(dn) +#define DNODE_VERIFY_CLEAN(dn) #define FREE_VERIFY(db, start, end, tx) #endif