OpenZFS 9577 - remove zfs_dbuf_evict_key tsd

The zfs_dbuf_evict_key TSD (thread-specific data) is not necessary -
we can instead pass a flag down in a few places to prevent recursive
dbuf eviction. Making this change has 3 benefits:

1. The code semantics are easier to understand.
2. On Linux, performance is improved, because creating/removing
   TSD values (by setting to NULL vs non-NULL) is expensive, and
   we do it very often.
3. According to Nexenta, the current semantics can cause a
   deadlock when concurrently calling dmu_objset_evict_dbufs()
   (which is rare today, but they are working on a "parallel
   unmount" change that triggers this more easily):

Porting Notes:
* Minor conflict with OpenZFS 9337 which has not yet been ported.

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://illumos.org/issues/9577
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/645
External-issue: DLPX-58547
Closes #7602
This commit is contained in:
Matthew Ahrens 2018-05-31 10:29:12 -07:00 committed by Brian Behlendorf
parent 232dd8b956
commit 1fac63e56f
5 changed files with 46 additions and 55 deletions

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */
@ -305,7 +305,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
uint64_t dbuf_refcount(dmu_buf_impl_t *db); uint64_t dbuf_refcount(dmu_buf_impl_t *db);
void dbuf_rele(dmu_buf_impl_t *db, void *tag); void dbuf_rele(dmu_buf_impl_t *db, void *tag);
void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid); uint64_t blkid);

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */
@ -408,7 +408,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
void *ref, dnode_t **dnp); void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref); boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref);
void dnode_rele_and_unlock(dnode_t *dn, void *tag); void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,

View File

@ -155,8 +155,6 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
void *tag, dmu_buf_impl_t **dbp, int depth); void *tag, dmu_buf_impl_t **dbp, int depth);
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
uint_t zfs_dbuf_evict_key;
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@ -596,14 +594,6 @@ dbuf_evict_one(void)
ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
/*
* Set the thread's tsd to indicate that it's processing evictions.
* Once a thread stops evicting from the dbuf cache it will
* reset its tsd to NULL.
*/
ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
dmu_buf_impl_t *db = multilist_sublist_tail(mls); dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db); db = multilist_sublist_prev(mls, db);
@ -628,7 +618,6 @@ dbuf_evict_one(void)
} else { } else {
multilist_sublist_unlock(mls); multilist_sublist_unlock(mls);
} }
(void) tsd_set(zfs_dbuf_evict_key, NULL);
} }
/* /*
@ -682,29 +671,6 @@ dbuf_evict_thread(void *unused)
static void static void
dbuf_evict_notify(void) dbuf_evict_notify(void)
{ {
/*
* We use thread specific data to track when a thread has
* started processing evictions. This allows us to avoid deeply
* nested stacks that would have a call flow similar to this:
*
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
* ^ |
* | |
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
*
* The dbuf_eviction_thread will always have its tsd set until
* that thread exits. All other threads will only set their tsd
* if they are participating in the eviction process. This only
* happens if the eviction thread is unable to process evictions
* fast enough. To keep the dbuf cache size in check, other threads
* can evict from the dbuf cache directly. Those threads will set
* their tsd values so that we ensure that they only evict one dbuf
* from the dbuf cache.
*/
if (tsd_get(zfs_dbuf_evict_key) != NULL)
return;
/* /*
* We check if we should evict without holding the dbuf_evict_lock, * We check if we should evict without holding the dbuf_evict_lock,
* because it's OK to occasionally make the wrong decision here, * because it's OK to occasionally make the wrong decision here,
@ -801,7 +767,6 @@ retry:
dbuf_cache_multilist_index_func); dbuf_cache_multilist_index_func);
refcount_create(&dbuf_cache_size); refcount_create(&dbuf_cache_size);
tsd_create(&zfs_dbuf_evict_key, NULL);
dbuf_evict_thread_exit = B_FALSE; dbuf_evict_thread_exit = B_FALSE;
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
@ -858,7 +823,6 @@ dbuf_fini(void)
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
} }
mutex_exit(&dbuf_evict_lock); mutex_exit(&dbuf_evict_lock);
tsd_destroy(&zfs_dbuf_evict_key);
mutex_destroy(&dbuf_evict_lock); mutex_destroy(&dbuf_evict_lock);
cv_destroy(&dbuf_evict_cv); cv_destroy(&dbuf_evict_cv);
@ -1152,7 +1116,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
} }
cv_broadcast(&db->db_changed); cv_broadcast(&db->db_changed);
dbuf_rele_and_unlock(db, NULL); dbuf_rele_and_unlock(db, NULL, B_FALSE);
} }
static int static int
@ -2441,7 +2405,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
* release any lock. * release any lock.
*/ */
dnode_rele(dn, db); mutex_enter(&dn->dn_mtx);
dnode_rele_and_unlock(dn, db, B_TRUE);
db->db_dnode_handle = NULL; db->db_dnode_handle = NULL;
dbuf_hash_remove(db); dbuf_hash_remove(db);
@ -2467,8 +2432,10 @@ dbuf_destroy(dmu_buf_impl_t *db)
* If this dbuf is referenced from an indirect dbuf, * If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf. * decrement the ref count on the indirect dbuf.
*/ */
if (parent && parent != dndb) if (parent && parent != dndb) {
dbuf_rele(parent, db); mutex_enter(&parent->db_mtx);
dbuf_rele_and_unlock(parent, db, B_TRUE);
}
} }
/* /*
@ -3195,7 +3162,7 @@ void
dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_rele(dmu_buf_impl_t *db, void *tag)
{ {
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
dbuf_rele_and_unlock(db, tag); dbuf_rele_and_unlock(db, tag, B_FALSE);
} }
void void
@ -3206,10 +3173,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
/* /*
* dbuf_rele() for an already-locked dbuf. This is necessary to allow * dbuf_rele() for an already-locked dbuf. This is necessary to allow
* db_dirtycnt and db_holds to be updated atomically. * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
* argument should be set if we are already in the dbuf-evicting code
* path, in which case we don't want to recursively evict. This allows us to
* avoid deeply nested stacks that would have a call flow similar to this:
*
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
* ^ |
* | |
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
*
*/ */
void void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
{ {
int64_t holds; int64_t holds;
@ -3310,6 +3286,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
refcount_count(&dbuf_cache_size)); refcount_count(&dbuf_cache_size));
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
if (!evicting)
dbuf_evict_notify(); dbuf_evict_notify();
} }
@ -3647,7 +3624,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
kmem_free(dr, sizeof (dbuf_dirty_record_t)); kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0); ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1; db->db_dirtycnt -= 1;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
return; return;
} }
@ -4020,7 +3997,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0); ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1; db->db_dirtycnt -= 1;
db->db_data_pending = NULL; db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
} }
static void static void

View File

@ -1571,11 +1571,11 @@ void
dnode_rele(dnode_t *dn, void *tag) dnode_rele(dnode_t *dn, void *tag)
{ {
mutex_enter(&dn->dn_mtx); mutex_enter(&dn->dn_mtx);
dnode_rele_and_unlock(dn, tag); dnode_rele_and_unlock(dn, tag, B_FALSE);
} }
void void
dnode_rele_and_unlock(dnode_t *dn, void *tag) dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
{ {
uint64_t refs; uint64_t refs;
/* Get while the hold prevents the dnode from moving. */ /* Get while the hold prevents the dnode from moving. */
@ -1606,7 +1606,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag)
* that the handle has zero references, but that will be * that the handle has zero references, but that will be
* asserted anyway when the handle gets destroyed. * asserted anyway when the handle gets destroyed.
*/ */
dbuf_rele(db, dnh); mutex_enter(&db->db_mtx);
dbuf_rele_and_unlock(db, dnh, evicting);
} }
} }

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */
@ -424,6 +424,19 @@ dnode_evict_dbufs(dnode_t *dn)
avl_insert_here(&dn->dn_dbufs, db_marker, db, avl_insert_here(&dn->dn_dbufs, db_marker, db,
AVL_BEFORE); AVL_BEFORE);
/*
* We need to use the "marker" dbuf rather than
* simply getting the next dbuf, because
* dbuf_destroy() may actually remove multiple dbufs.
* It can call itself recursively on the parent dbuf,
* which may also be removed from dn_dbufs. The code
* flow would look like:
*
* dbuf_destroy():
* dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
* if (!cacheable || pending_evict)
* dbuf_destroy()
*/
dbuf_destroy(db); dbuf_destroy(db);
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
@ -484,7 +497,7 @@ dnode_undirty_dbufs(list_t *list)
list_destroy(&dr->dt.di.dr_children); list_destroy(&dr->dt.di.dr_children);
} }
kmem_free(dr, sizeof (dbuf_dirty_record_t)); kmem_free(dr, sizeof (dbuf_dirty_record_t));
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
} }
} }