From 9925c28cdec943a6ffa81219cb469b727decf111 Mon Sep 17 00:00:00 2001 From: Alex Reece Date: Thu, 2 Apr 2015 02:10:58 +1100 Subject: [PATCH] Illumos 5095 - panic when adding a duplicate dbuf to dn_dbufs 5095 panic when adding a duplicate dbuf to dn_dbufs Author: Alex Reece Reviewed by: Adam Leventhal Reviewed by: George Wilson Reviewed by: Matthew Ahrens Reviewed by: Dan Kimmel Reviewed by: Dan McDonald Reviewed by: Josef Sipek Approved by: Robert Mustacchi References: https://www.illumos.org/issues/5095 https://github.com/illumos/illumos-gate/commit/86bb58a Ported-by: Chris Dunlop Signed-off-by: Brian Behlendorf --- include/sys/dbuf.h | 8 +++++--- include/sys/dnode.h | 13 ++++++++++++- module/zfs/dbuf.c | 4 +--- module/zfs/dnode.c | 36 +++++++++++++++++++----------------- 4 files changed, 37 insertions(+), 24 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 2f593bb4d6..d253add59b 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -66,8 +66,13 @@ extern "C" { * | | * | | * +--------> NOFILL -------+ + * + * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range + * to find all dbufs in a range of a dnode and must be less than any other + * dbuf_states_t (see comment on dn_dbufs in dnode.h). */ typedef enum dbuf_states { + DB_SEARCH = -1, DB_UNCACHED, DB_FILL, DB_NOFILL, @@ -213,9 +218,6 @@ typedef struct dmu_buf_impl { /* pointer to most recent dirty record for this buffer */ dbuf_dirty_record_t *db_last_dirty; - /* Creation time of dbuf (see comment in dbuf_compare). */ - hrtime_t db_creation; - /* * Our link on the owner dnodes's dn_dbufs list. * Protected by its dn_dbufs_mtx. diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 2974a20dc6..90a334ba74 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -233,7 +233,18 @@ typedef struct dnode { refcount_t dn_holds; kmutex_t dn_dbufs_mtx; - avl_tree_t dn_dbufs; /* descendent dbufs */ + /* + * Descendent dbufs, ordered by dbuf_compare.
Note that dn_dbufs + * can contain multiple dbufs of the same (level, blkid) when a + * dbuf is marked DB_EVICTING without being removed from + * dn_dbufs. To maintain the avl invariant that there cannot be + * duplicate entries, we order the dbufs by an arbitrary value - + * their address in memory. This means that dn_dbufs cannot be used to + * directly look up a dbuf. Instead, callers must use avl_walk, have + * a reference to the dbuf, or look up a non-existent node with + * db_state = DB_SEARCH (see dbuf_free_range for an example). + */ + avl_tree_t dn_dbufs; /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e9c8580fc3..e6e24e0e97 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -94,8 +94,6 @@ dbuf_cons(void *vdb, void *unused, int kmflag) cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); - db->db_creation = gethrtime(); - return (0); } @@ -884,7 +882,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); db_search->db_level = 0; db_search->db_blkid = start_blkid; - db_search->db_creation = 0; + db_search->db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 7c28dc64d1..2b022860e3 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -70,33 +70,35 @@ dbuf_compare(const void *x1, const void *x2) if (d1->db_level < d2->db_level) { return (-1); - } else if (d1->db_level > d2->db_level) { + } + if (d1->db_level > d2->db_level) { return (1); } if (d1->db_blkid < d2->db_blkid) { return (-1); - } else if (d1->db_blkid > d2->db_blkid) { + } + if (d1->db_blkid > d2->db_blkid) { return (1); } - /* - * If a dbuf is being evicted while dn_dbufs_mutex is not held, we set - * the db_state to DB_EVICTING but do not remove it from
dn_dbufs. If - * another thread creates a dbuf of the same blkid before the dbuf is - * removed from dn_dbufs, we can reach a state where there are two - * dbufs of the same blkid and level in db_dbufs. To maintain the avl - * invariant that there cannot be duplicate items, we distinguish - * between these two dbufs based on the time they were created. - */ - if (d1->db_creation < d2->db_creation) { + if (d1->db_state < d2->db_state) { return (-1); - } else if (d1->db_creation > d2->db_creation) { - return (1); - } else { - ASSERT3P(d1, ==, d2); - return (0); } + if (d1->db_state > d2->db_state) { + return (1); + } + + ASSERT3S(d1->db_state, !=, DB_SEARCH); + ASSERT3S(d2->db_state, !=, DB_SEARCH); + + if ((uintptr_t)d1 < (uintptr_t)d2) { + return (-1); + } + if ((uintptr_t)d1 > (uintptr_t)d2) { + return (1); + } + return (0); } /* ARGSUSED */