OpenZFS 9577 - remove zfs_dbuf_evict_key tsd
The zfs_dbuf_evict_key TSD (thread-specific data) is not necessary - we can instead pass a flag down in a few places to prevent recursive dbuf eviction. Making this change has 3 benefits: 1. The code semantics are easier to understand. 2. On Linux, performance is improved, because creating/removing TSD values (by setting to NULL vs non-NULL) is expensive, and we do it very often. 3. According to Nexenta, the current semantics can cause a deadlock when concurrently calling dmu_objset_evict_dbufs() (which is rare today, but they are working on a "parallel unmount" change that triggers this more easily): Porting Notes: * Minor conflict with OpenZFS 9337 which has not yet been ported. Authored by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com> Reviewed by: Brad Lewis <brad.lewis@delphix.com> Reviewed-by: George Melikov <mail@gmelikov.ru> Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://illumos.org/issues/9577 OpenZFS-commit: https://github.com/openzfs/openzfs/pull/645 External-issue: DLPX-58547 Closes #7602
This commit is contained in:
parent
232dd8b956
commit
1fac63e56f
|
@ -20,7 +20,7 @@
|
||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
@ -305,7 +305,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
|
||||||
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
|
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
|
||||||
|
|
||||||
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
|
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
|
||||||
void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
|
void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
|
||||||
|
|
||||||
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
|
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
|
||||||
uint64_t blkid);
|
uint64_t blkid);
|
||||||
|
|
|
@ -20,7 +20,7 @@
|
||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -408,7 +408,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
|
||||||
void *ref, dnode_t **dnp);
|
void *ref, dnode_t **dnp);
|
||||||
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
|
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
|
||||||
void dnode_rele(dnode_t *dn, void *ref);
|
void dnode_rele(dnode_t *dn, void *ref);
|
||||||
void dnode_rele_and_unlock(dnode_t *dn, void *tag);
|
void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
|
||||||
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
|
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
|
||||||
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
|
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
|
||||||
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
|
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
|
||||||
|
|
|
@ -155,8 +155,6 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
|
||||||
void *tag, dmu_buf_impl_t **dbp, int depth);
|
void *tag, dmu_buf_impl_t **dbp, int depth);
|
||||||
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
|
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
|
||||||
|
|
||||||
uint_t zfs_dbuf_evict_key;
|
|
||||||
|
|
||||||
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
|
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
|
||||||
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
|
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
|
||||||
|
|
||||||
|
@ -596,14 +594,6 @@ dbuf_evict_one(void)
|
||||||
|
|
||||||
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
|
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
|
||||||
|
|
||||||
/*
|
|
||||||
* Set the thread's tsd to indicate that it's processing evictions.
|
|
||||||
* Once a thread stops evicting from the dbuf cache it will
|
|
||||||
* reset its tsd to NULL.
|
|
||||||
*/
|
|
||||||
ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
|
|
||||||
(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
|
|
||||||
|
|
||||||
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
|
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
|
||||||
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
|
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
|
||||||
db = multilist_sublist_prev(mls, db);
|
db = multilist_sublist_prev(mls, db);
|
||||||
|
@ -628,7 +618,6 @@ dbuf_evict_one(void)
|
||||||
} else {
|
} else {
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
}
|
}
|
||||||
(void) tsd_set(zfs_dbuf_evict_key, NULL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -682,29 +671,6 @@ dbuf_evict_thread(void *unused)
|
||||||
static void
|
static void
|
||||||
dbuf_evict_notify(void)
|
dbuf_evict_notify(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
/*
|
|
||||||
* We use thread specific data to track when a thread has
|
|
||||||
* started processing evictions. This allows us to avoid deeply
|
|
||||||
* nested stacks that would have a call flow similar to this:
|
|
||||||
*
|
|
||||||
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
|
|
||||||
* ^ |
|
|
||||||
* | |
|
|
||||||
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
|
|
||||||
*
|
|
||||||
* The dbuf_eviction_thread will always have its tsd set until
|
|
||||||
* that thread exits. All other threads will only set their tsd
|
|
||||||
* if they are participating in the eviction process. This only
|
|
||||||
* happens if the eviction thread is unable to process evictions
|
|
||||||
* fast enough. To keep the dbuf cache size in check, other threads
|
|
||||||
* can evict from the dbuf cache directly. Those threads will set
|
|
||||||
* their tsd values so that we ensure that they only evict one dbuf
|
|
||||||
* from the dbuf cache.
|
|
||||||
*/
|
|
||||||
if (tsd_get(zfs_dbuf_evict_key) != NULL)
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We check if we should evict without holding the dbuf_evict_lock,
|
* We check if we should evict without holding the dbuf_evict_lock,
|
||||||
* because it's OK to occasionally make the wrong decision here,
|
* because it's OK to occasionally make the wrong decision here,
|
||||||
|
@ -801,7 +767,6 @@ retry:
|
||||||
dbuf_cache_multilist_index_func);
|
dbuf_cache_multilist_index_func);
|
||||||
refcount_create(&dbuf_cache_size);
|
refcount_create(&dbuf_cache_size);
|
||||||
|
|
||||||
tsd_create(&zfs_dbuf_evict_key, NULL);
|
|
||||||
dbuf_evict_thread_exit = B_FALSE;
|
dbuf_evict_thread_exit = B_FALSE;
|
||||||
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
|
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||||
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
|
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
|
||||||
|
@ -858,7 +823,6 @@ dbuf_fini(void)
|
||||||
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
|
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
|
||||||
}
|
}
|
||||||
mutex_exit(&dbuf_evict_lock);
|
mutex_exit(&dbuf_evict_lock);
|
||||||
tsd_destroy(&zfs_dbuf_evict_key);
|
|
||||||
|
|
||||||
mutex_destroy(&dbuf_evict_lock);
|
mutex_destroy(&dbuf_evict_lock);
|
||||||
cv_destroy(&dbuf_evict_cv);
|
cv_destroy(&dbuf_evict_cv);
|
||||||
|
@ -1152,7 +1116,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||||
db->db_state = DB_UNCACHED;
|
db->db_state = DB_UNCACHED;
|
||||||
}
|
}
|
||||||
cv_broadcast(&db->db_changed);
|
cv_broadcast(&db->db_changed);
|
||||||
dbuf_rele_and_unlock(db, NULL);
|
dbuf_rele_and_unlock(db, NULL, B_FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -2441,7 +2405,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
|
||||||
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
|
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
|
||||||
* release any lock.
|
* release any lock.
|
||||||
*/
|
*/
|
||||||
dnode_rele(dn, db);
|
mutex_enter(&dn->dn_mtx);
|
||||||
|
dnode_rele_and_unlock(dn, db, B_TRUE);
|
||||||
db->db_dnode_handle = NULL;
|
db->db_dnode_handle = NULL;
|
||||||
|
|
||||||
dbuf_hash_remove(db);
|
dbuf_hash_remove(db);
|
||||||
|
@ -2467,8 +2432,10 @@ dbuf_destroy(dmu_buf_impl_t *db)
|
||||||
* If this dbuf is referenced from an indirect dbuf,
|
* If this dbuf is referenced from an indirect dbuf,
|
||||||
* decrement the ref count on the indirect dbuf.
|
* decrement the ref count on the indirect dbuf.
|
||||||
*/
|
*/
|
||||||
if (parent && parent != dndb)
|
if (parent && parent != dndb) {
|
||||||
dbuf_rele(parent, db);
|
mutex_enter(&parent->db_mtx);
|
||||||
|
dbuf_rele_and_unlock(parent, db, B_TRUE);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -3195,7 +3162,7 @@ void
|
||||||
dbuf_rele(dmu_buf_impl_t *db, void *tag)
|
dbuf_rele(dmu_buf_impl_t *db, void *tag)
|
||||||
{
|
{
|
||||||
mutex_enter(&db->db_mtx);
|
mutex_enter(&db->db_mtx);
|
||||||
dbuf_rele_and_unlock(db, tag);
|
dbuf_rele_and_unlock(db, tag, B_FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -3206,10 +3173,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
|
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
|
||||||
* db_dirtycnt and db_holds to be updated atomically.
|
* db_dirtycnt and db_holds to be updated atomically. The 'evicting'
|
||||||
|
* argument should be set if we are already in the dbuf-evicting code
|
||||||
|
* path, in which case we don't want to recursively evict. This allows us to
|
||||||
|
* avoid deeply nested stacks that would have a call flow similar to this:
|
||||||
|
*
|
||||||
|
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
|
||||||
|
* ^ |
|
||||||
|
* | |
|
||||||
|
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
|
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
|
||||||
{
|
{
|
||||||
int64_t holds;
|
int64_t holds;
|
||||||
|
|
||||||
|
@ -3310,6 +3286,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
|
||||||
refcount_count(&dbuf_cache_size));
|
refcount_count(&dbuf_cache_size));
|
||||||
mutex_exit(&db->db_mtx);
|
mutex_exit(&db->db_mtx);
|
||||||
|
|
||||||
|
if (!evicting)
|
||||||
dbuf_evict_notify();
|
dbuf_evict_notify();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3647,7 +3624,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
||||||
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
||||||
ASSERT(db->db_dirtycnt > 0);
|
ASSERT(db->db_dirtycnt > 0);
|
||||||
db->db_dirtycnt -= 1;
|
db->db_dirtycnt -= 1;
|
||||||
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4020,7 +3997,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
||||||
ASSERT(db->db_dirtycnt > 0);
|
ASSERT(db->db_dirtycnt > 0);
|
||||||
db->db_dirtycnt -= 1;
|
db->db_dirtycnt -= 1;
|
||||||
db->db_data_pending = NULL;
|
db->db_data_pending = NULL;
|
||||||
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
|
@ -1571,11 +1571,11 @@ void
|
||||||
dnode_rele(dnode_t *dn, void *tag)
|
dnode_rele(dnode_t *dn, void *tag)
|
||||||
{
|
{
|
||||||
mutex_enter(&dn->dn_mtx);
|
mutex_enter(&dn->dn_mtx);
|
||||||
dnode_rele_and_unlock(dn, tag);
|
dnode_rele_and_unlock(dn, tag, B_FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
dnode_rele_and_unlock(dnode_t *dn, void *tag)
|
dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
|
||||||
{
|
{
|
||||||
uint64_t refs;
|
uint64_t refs;
|
||||||
/* Get while the hold prevents the dnode from moving. */
|
/* Get while the hold prevents the dnode from moving. */
|
||||||
|
@ -1606,7 +1606,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag)
|
||||||
* that the handle has zero references, but that will be
|
* that the handle has zero references, but that will be
|
||||||
* asserted anyway when the handle gets destroyed.
|
* asserted anyway when the handle gets destroyed.
|
||||||
*/
|
*/
|
||||||
dbuf_rele(db, dnh);
|
mutex_enter(&db->db_mtx);
|
||||||
|
dbuf_rele_and_unlock(db, dnh, evicting);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -424,6 +424,19 @@ dnode_evict_dbufs(dnode_t *dn)
|
||||||
avl_insert_here(&dn->dn_dbufs, db_marker, db,
|
avl_insert_here(&dn->dn_dbufs, db_marker, db,
|
||||||
AVL_BEFORE);
|
AVL_BEFORE);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We need to use the "marker" dbuf rather than
|
||||||
|
* simply getting the next dbuf, because
|
||||||
|
* dbuf_destroy() may actually remove multiple dbufs.
|
||||||
|
* It can call itself recursively on the parent dbuf,
|
||||||
|
* which may also be removed from dn_dbufs. The code
|
||||||
|
* flow would look like:
|
||||||
|
*
|
||||||
|
* dbuf_destroy():
|
||||||
|
* dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
|
||||||
|
* if (!cacheable || pending_evict)
|
||||||
|
* dbuf_destroy()
|
||||||
|
*/
|
||||||
dbuf_destroy(db);
|
dbuf_destroy(db);
|
||||||
|
|
||||||
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
|
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
|
||||||
|
@ -484,7 +497,7 @@ dnode_undirty_dbufs(list_t *list)
|
||||||
list_destroy(&dr->dt.di.dr_children);
|
list_destroy(&dr->dt.di.dr_children);
|
||||||
}
|
}
|
||||||
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
||||||
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue