Illumos 5008 - lock contention (rrw_exit) while running a read only load

5008 lock contention (rrw_exit) while running a read only load
Reviewed by: Matthew Ahrens <matthew.ahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Alex Reece <alex.reece@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Approved by: Garrett D'Amore <garrett@damore.org>

Porting notes:

This patch ported cleanly to ZoL.  While testing 100% cached small-block
reads, extreme contention was noticed on rrl->rr_lock from rrw_exit() due
to frequently entering and leaving the ZPL.  Illumos picked up this patch
from FreeBSD and it also helps under Linux.

On a 1-minute 4K cached read test with 10 fio processes pinned to a single
socket on a 4-socket (10 threads per socket) NUMA system, contentions on
rrl->rr_lock were reduced from 508799 to 43085.

Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3555
This commit is contained in:
Alexander Motin 2014-07-18 08:53:38 -08:00 committed by Brian Behlendorf
parent 4bda3bd0e7
commit e16b3fcc61
6 changed files with 126 additions and 13 deletions

View File

@ -83,6 +83,31 @@ void rrw_tsd_destroy(void *arg);
#define RRW_LOCK_HELD(x) \ #define RRW_LOCK_HELD(x) \
(rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
/*
* A reader-mostly lock implementation, tuning above reader-writer locks
* for highly parallel read acquisitions, pessimizing write acquisitions.
*
* RRM_NUM_LOCKS should be a prime number. See comment in rrwlock.c near
* RRM_TD_LOCK() for details.
*/
#define RRM_NUM_LOCKS 17
/* A reader-mostly lock: a fixed array of RRM_NUM_LOCKS rrwlocks. */
typedef struct rrmlock {
rrwlock_t locks[RRM_NUM_LOCKS];
} rrmlock_t;
/* Initialize / destroy every rrwlock in the array. */
void rrm_init(rrmlock_t *rrl, boolean_t track_all);
void rrm_destroy(rrmlock_t *rrl);
/* Dispatch to rrm_enter_read() for RW_READER, else rrm_enter_write(). */
void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
/* Read-lock only the rrwlock selected for the calling thread. */
void rrm_enter_read(rrmlock_t *rrl, void *tag);
/* Write-lock every rrwlock in the array, in index order. */
void rrm_enter_write(rrmlock_t *rrl);
/* Release whichever hold (all locks as writer, or one as reader) is held. */
void rrm_exit(rrmlock_t *rrl, void *tag);
/* Query lock 0 for RW_WRITER, or the calling thread's lock otherwise. */
boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
/* Convenience predicates built on rrm_held(). */
#define RRM_READ_HELD(x) rrm_held(x, RW_READER)
#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER)
#define RRM_LOCK_HELD(x) \
(rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -67,7 +67,7 @@ typedef struct zfs_sb {
boolean_t z_atime; /* enable atimes mount option */ boolean_t z_atime; /* enable atimes mount option */
boolean_t z_relatime; /* enable relatime mount option */ boolean_t z_relatime; /* enable relatime mount option */
boolean_t z_unmounted; /* unmounted */ boolean_t z_unmounted; /* unmounted */
rrwlock_t z_teardown_lock; rrmlock_t z_teardown_lock;
krwlock_t z_teardown_inactive_lock; krwlock_t z_teardown_inactive_lock;
list_t z_all_znodes; /* all znodes in the fs */ list_t z_all_znodes; /* all znodes in the fs */
uint64_t z_nr_znodes; /* number of znodes in the fs */ uint64_t z_nr_znodes; /* number of znodes in the fs */

View File

@ -250,7 +250,7 @@ typedef struct znode {
/* Called on entry to each ZFS vnode and vfs operation */ /* Called on entry to each ZFS vnode and vfs operation */
#define ZFS_ENTER(zsb) \ #define ZFS_ENTER(zsb) \
{ \ { \
rrw_enter_read(&(zsb)->z_teardown_lock, FTAG); \ rrm_enter_read(&(zsb)->z_teardown_lock, FTAG); \
if ((zsb)->z_unmounted) { \ if ((zsb)->z_unmounted) { \
ZFS_EXIT(zsb); \ ZFS_EXIT(zsb); \
return (EIO); \ return (EIO); \
@ -260,7 +260,7 @@ typedef struct znode {
/* Must be called before exiting the vop */ /* Must be called before exiting the vop */
#define ZFS_EXIT(zsb) \ #define ZFS_EXIT(zsb) \
{ \ { \
rrw_exit(&(zsb)->z_teardown_lock, FTAG); \ rrm_exit(&(zsb)->z_teardown_lock, FTAG); \
} }
/* Verifies the znode is valid */ /* Verifies the znode is valid */

View File

@ -305,3 +305,91 @@ rrw_tsd_destroy(void *arg)
(void *)curthread, (void *)rn->rn_rrl); (void *)curthread, (void *)rn->rn_rrl);
} }
} }
/*
* A reader-mostly lock implementation, tuning the reader-writer locks above
* for highly parallel read acquisitions, while pessimizing writes.
*
* The idea is to split a single busy lock into an array of locks, so that
* each reader locks only one of them for read, depending on the result
* of a simple hash function. That proportionally reduces lock contention.
* A writer, at the same time, has to sequentially acquire write on all the
* locks. That makes write acquisition proportionally slower, but in places
* where it is used (filesystem unmount) performance is not critical.
*
* All the functions below are direct wrappers around functions above.
*/
/*
 * Set up a reader-mostly lock by initializing each of its RRM_NUM_LOCKS
 * underlying rrwlocks, passing track_all through to every rrw_init() call.
 */
void
rrm_init(rrmlock_t *rrl, boolean_t track_all)
{
	rrwlock_t *lock;

	for (lock = rrl->locks; lock < rrl->locks + RRM_NUM_LOCKS; lock++)
		rrw_init(lock, track_all);
}
/*
 * Tear down a reader-mostly lock by destroying each underlying rrwlock,
 * from index 0 upward.
 */
void
rrm_destroy(rrmlock_t *rrl)
{
	int idx = 0;

	while (idx < RRM_NUM_LOCKS) {
		rrw_destroy(&rrl->locks[idx]);
		idx++;
	}
}
/*
 * Acquire the reader-mostly lock in the requested mode.  RW_READER goes
 * to rrm_enter_read(); any other mode goes to rrm_enter_write(), which
 * takes no tag, so the tag is only used on the reader path.
 */
void
rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
{
	if (rw != RW_READER) {
		rrm_enter_write(rrl);
		return;
	}

	rrm_enter_read(rrl, tag);
}
/*
* This maps the current thread to a specific lock. Note that the lock
* must be released by the same thread that acquired it. We do this
* mapping by taking the thread pointer mod a prime number. We examine
* only the low 32 bits of the thread pointer, because 32-bit division
* is faster than 64-bit division, and the high 32 bits have little
* entropy anyway.
*
* The result is an index into the rrmlock's locks[] array, i.e. a value
* in the range [0, RRM_NUM_LOCKS).
*/
#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
/*
 * Take a read hold on the single underlying rrwlock that RRM_TD_LOCK()
 * selects for the calling thread.
 */
void
rrm_enter_read(rrmlock_t *rrl, void *tag)
{
	rrwlock_t *mine = &rrl->locks[RRM_TD_LOCK()];

	rrw_enter_read(mine, tag);
}
/*
 * Take a write hold on the reader-mostly lock by write-locking every
 * underlying rrwlock, always in ascending index order.
 */
void
rrm_enter_write(rrmlock_t *rrl)
{
	rrwlock_t *lock;

	for (lock = rrl->locks; lock < rrl->locks + RRM_NUM_LOCKS; lock++)
		rrw_enter_write(lock);
}
/*
 * Drop the caller's hold on the reader-mostly lock.  If the current
 * thread is recorded as the writer of lock 0, it is treated as the
 * writer of the whole set and every underlying lock is released in
 * ascending order.  Otherwise only the lock that RRM_TD_LOCK() selects
 * for this thread is released.
 */
void
rrm_exit(rrmlock_t *rrl, void *tag)
{
	int idx;

	if (rrl->locks[0].rr_writer != curthread) {
		/* Reader path: release just this thread's lock. */
		rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
		return;
	}

	for (idx = 0; idx < RRM_NUM_LOCKS; idx++)
		rrw_exit(&rrl->locks[idx], tag);
}
/*
 * Report whether the reader-mostly lock is held in mode rw.  For
 * RW_WRITER the answer comes from lock 0; for any other mode it comes
 * from the lock that RRM_TD_LOCK() selects for the calling thread.
 */
boolean_t
rrm_held(rrmlock_t *rrl, krw_t rw)
{
	rrwlock_t *lock;

	if (rw == RW_WRITER)
		lock = &rrl->locks[0];
	else
		lock = &rrl->locks[RRM_TD_LOCK()];

	return (rrw_held(lock, rw));
}

View File

@ -1451,7 +1451,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
if (get_zfs_sb(name, zsbp) != 0) if (get_zfs_sb(name, zsbp) != 0)
error = zfs_sb_create(name, zsbp); error = zfs_sb_create(name, zsbp);
if (error == 0) { if (error == 0) {
rrw_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER : rrm_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER :
RW_READER, tag); RW_READER, tag);
if ((*zsbp)->z_unmounted) { if ((*zsbp)->z_unmounted) {
/* /*
@ -1459,7 +1459,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
* thread should be just about to disassociate the * thread should be just about to disassociate the
* objset from the zsb. * objset from the zsb.
*/ */
rrw_exit(&(*zsbp)->z_teardown_lock, tag); rrm_exit(&(*zsbp)->z_teardown_lock, tag);
return (SET_ERROR(EBUSY)); return (SET_ERROR(EBUSY));
} }
} }
@ -1469,7 +1469,7 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
static void static void
zfs_sb_rele(zfs_sb_t *zsb, void *tag) zfs_sb_rele(zfs_sb_t *zsb, void *tag)
{ {
rrw_exit(&zsb->z_teardown_lock, tag); rrm_exit(&zsb->z_teardown_lock, tag);
if (zsb->z_sb) { if (zsb->z_sb) {
deactivate_super(zsb->z_sb); deactivate_super(zsb->z_sb);

View File

@ -771,7 +771,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zsb->z_all_znodes, sizeof (znode_t), list_create(&zsb->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node)); offsetof(znode_t, z_link_node));
rrw_init(&zsb->z_teardown_lock, B_FALSE); rrm_init(&zsb->z_teardown_lock, B_FALSE);
rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL); rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);
@ -890,7 +890,7 @@ zfs_sb_free(zfs_sb_t *zsb)
mutex_destroy(&zsb->z_znodes_lock); mutex_destroy(&zsb->z_znodes_lock);
mutex_destroy(&zsb->z_lock); mutex_destroy(&zsb->z_lock);
list_destroy(&zsb->z_all_znodes); list_destroy(&zsb->z_all_znodes);
rrw_destroy(&zsb->z_teardown_lock); rrm_destroy(&zsb->z_teardown_lock);
rw_destroy(&zsb->z_teardown_inactive_lock); rw_destroy(&zsb->z_teardown_inactive_lock);
rw_destroy(&zsb->z_fuid_lock); rw_destroy(&zsb->z_fuid_lock);
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@ -1221,7 +1221,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
} }
} }
rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG); rrm_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG);
if (!unmounting) { if (!unmounting) {
/* /*
@ -1252,7 +1252,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
*/ */
if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) { if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) {
rw_exit(&zsb->z_teardown_inactive_lock); rw_exit(&zsb->z_teardown_inactive_lock);
rrw_exit(&zsb->z_teardown_lock, FTAG); rrm_exit(&zsb->z_teardown_lock, FTAG);
return (SET_ERROR(EIO)); return (SET_ERROR(EIO));
} }
@ -1280,7 +1280,7 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting)
*/ */
if (unmounting) { if (unmounting) {
zsb->z_unmounted = B_TRUE; zsb->z_unmounted = B_TRUE;
rrw_exit(&zsb->z_teardown_lock, FTAG); rrm_exit(&zsb->z_teardown_lock, FTAG);
rw_exit(&zsb->z_teardown_inactive_lock); rw_exit(&zsb->z_teardown_inactive_lock);
} }
@ -1599,7 +1599,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
znode_t *zp; znode_t *zp;
uint64_t sa_obj = 0; uint64_t sa_obj = 0;
ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock)); ASSERT(RRM_WRITE_HELD(&zsb->z_teardown_lock));
ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock)); ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock));
/* /*
@ -1663,7 +1663,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname)
bail: bail:
/* release the VFS ops */ /* release the VFS ops */
rw_exit(&zsb->z_teardown_inactive_lock); rw_exit(&zsb->z_teardown_inactive_lock);
rrw_exit(&zsb->z_teardown_lock, FTAG); rrm_exit(&zsb->z_teardown_lock, FTAG);
if (err) { if (err) {
/* /*