Allow for lock-free reading zfsdev_state_list.
Restructure the zfsdev_state_list to allow for lock-free reading by converting to a simple singly-linked list from which items are never deleted and over which only forward iterations are performed. It depends on, among other things, the atomicity of accessing the zs_minor integer and zs_next pointer. This fixes a lock inversion in which the zfsdev_state_lock is used by both the sync task (txg_sync) and indirectly by any user program which uses /dev/zfs; the zfsdev_release method uses the same lock and then blocks on the sync task. The most typical failure scenerio occurs when the sync task is cleaning up a user hold while various concurrent "zfs" commands are in progress. Neither Illumos nor Solaris are affected by this issue because they use DDI interface which provides lock-free reading of device state via the ddi_get_soft_state() function. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Chunwei Chen <tuxoko@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #2301
This commit is contained in:
parent
1cbae971c5
commit
3937ab20f3
|
@ -371,8 +371,15 @@ enum zfsdev_state_type {
|
||||||
ZST_ALL,
|
ZST_ALL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The zfsdev_state_t structure is managed as a singly-linked list
|
||||||
|
* from which items are never deleted. This allows for lock-free
|
||||||
|
* reading of the list so long as assignments to the zs_next and
|
||||||
|
* reads from zs_minor are performed atomically. Empty items are
|
||||||
|
* indicated by storing -1 into zs_minor.
|
||||||
|
*/
|
||||||
typedef struct zfsdev_state {
|
typedef struct zfsdev_state {
|
||||||
list_node_t zs_next; /* next zfsdev_state_t link */
|
struct zfsdev_state *zs_next; /* next zfsdev_state_t link */
|
||||||
struct file *zs_file; /* associated file struct */
|
struct file *zs_file; /* associated file struct */
|
||||||
minor_t zs_minor; /* made up minor number */
|
minor_t zs_minor; /* made up minor number */
|
||||||
void *zs_onexit; /* onexit data */
|
void *zs_onexit; /* onexit data */
|
||||||
|
|
|
@ -191,7 +191,7 @@
|
||||||
#include "zfs_comutil.h"
|
#include "zfs_comutil.h"
|
||||||
|
|
||||||
kmutex_t zfsdev_state_lock;
|
kmutex_t zfsdev_state_lock;
|
||||||
list_t zfsdev_state_list;
|
zfsdev_state_t *zfsdev_state_list;
|
||||||
|
|
||||||
extern void zfs_init(void);
|
extern void zfs_init(void);
|
||||||
extern void zfs_fini(void);
|
extern void zfs_fini(void);
|
||||||
|
@ -5447,11 +5447,9 @@ zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which)
|
||||||
{
|
{
|
||||||
zfsdev_state_t *zs;
|
zfsdev_state_t *zs;
|
||||||
|
|
||||||
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
|
||||||
|
|
||||||
for (zs = list_head(&zfsdev_state_list); zs != NULL;
|
|
||||||
zs = list_next(&zfsdev_state_list, zs)) {
|
|
||||||
if (zs->zs_minor == minor) {
|
if (zs->zs_minor == minor) {
|
||||||
|
smp_rmb();
|
||||||
switch (which) {
|
switch (which) {
|
||||||
case ZST_ONEXIT:
|
case ZST_ONEXIT:
|
||||||
return (zs->zs_onexit);
|
return (zs->zs_onexit);
|
||||||
|
@ -5471,9 +5469,7 @@ zfsdev_get_state(minor_t minor, enum zfsdev_state_type which)
|
||||||
{
|
{
|
||||||
void *ptr;
|
void *ptr;
|
||||||
|
|
||||||
mutex_enter(&zfsdev_state_lock);
|
|
||||||
ptr = zfsdev_get_state_impl(minor, which);
|
ptr = zfsdev_get_state_impl(minor, which);
|
||||||
mutex_exit(&zfsdev_state_lock);
|
|
||||||
|
|
||||||
return (ptr);
|
return (ptr);
|
||||||
}
|
}
|
||||||
|
@ -5514,8 +5510,9 @@ zfsdev_minor_alloc(void)
|
||||||
static int
|
static int
|
||||||
zfsdev_state_init(struct file *filp)
|
zfsdev_state_init(struct file *filp)
|
||||||
{
|
{
|
||||||
zfsdev_state_t *zs;
|
zfsdev_state_t *zs, *zsprev = NULL;
|
||||||
minor_t minor;
|
minor_t minor;
|
||||||
|
boolean_t newzs = B_FALSE;
|
||||||
|
|
||||||
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
|
||||||
|
|
||||||
|
@ -5523,16 +5520,40 @@ zfsdev_state_init(struct file *filp)
|
||||||
if (minor == 0)
|
if (minor == 0)
|
||||||
return (SET_ERROR(ENXIO));
|
return (SET_ERROR(ENXIO));
|
||||||
|
|
||||||
zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
|
for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
|
||||||
|
if (zs->zs_minor == -1)
|
||||||
|
break;
|
||||||
|
zsprev = zs;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!zs) {
|
||||||
|
zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
|
||||||
|
newzs = B_TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
zs->zs_file = filp;
|
zs->zs_file = filp;
|
||||||
zs->zs_minor = minor;
|
|
||||||
filp->private_data = zs;
|
filp->private_data = zs;
|
||||||
|
|
||||||
zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
|
zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
|
||||||
zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
|
zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
|
||||||
|
|
||||||
list_insert_tail(&zfsdev_state_list, zs);
|
|
||||||
|
/*
|
||||||
|
* In order to provide for lock-free concurrent read access
|
||||||
|
* to the minor list in zfsdev_get_state_impl(), new entries
|
||||||
|
* must be completely written before linking them into the
|
||||||
|
* list whereas existing entries are already linked; the last
|
||||||
|
* operation must be updating zs_minor (from -1 to the new
|
||||||
|
* value).
|
||||||
|
*/
|
||||||
|
if (newzs) {
|
||||||
|
zs->zs_minor = minor;
|
||||||
|
smp_wmb();
|
||||||
|
zsprev->zs_next = zs;
|
||||||
|
} else {
|
||||||
|
smp_wmb();
|
||||||
|
zs->zs_minor = minor;
|
||||||
|
}
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
@ -5546,12 +5567,10 @@ zfsdev_state_destroy(struct file *filp)
|
||||||
ASSERT(filp->private_data != NULL);
|
ASSERT(filp->private_data != NULL);
|
||||||
|
|
||||||
zs = filp->private_data;
|
zs = filp->private_data;
|
||||||
|
zs->zs_minor = -1;
|
||||||
zfs_onexit_destroy(zs->zs_onexit);
|
zfs_onexit_destroy(zs->zs_onexit);
|
||||||
zfs_zevent_destroy(zs->zs_zevent);
|
zfs_zevent_destroy(zs->zs_zevent);
|
||||||
|
|
||||||
list_remove(&zfsdev_state_list, zs);
|
|
||||||
kmem_free(zs, sizeof (zfsdev_state_t));
|
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5762,8 +5781,8 @@ zfs_attach(void)
|
||||||
int error;
|
int error;
|
||||||
|
|
||||||
mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
|
mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||||
list_create(&zfsdev_state_list, sizeof (zfsdev_state_t),
|
zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
|
||||||
offsetof(zfsdev_state_t, zs_next));
|
zfsdev_state_list->zs_minor = -1;
|
||||||
|
|
||||||
error = misc_register(&zfs_misc);
|
error = misc_register(&zfs_misc);
|
||||||
if (error != 0) {
|
if (error != 0) {
|
||||||
|
@ -5778,13 +5797,21 @@ static void
|
||||||
zfs_detach(void)
|
zfs_detach(void)
|
||||||
{
|
{
|
||||||
int error;
|
int error;
|
||||||
|
zfsdev_state_t *zs, *zsprev = NULL;
|
||||||
|
|
||||||
error = misc_deregister(&zfs_misc);
|
error = misc_deregister(&zfs_misc);
|
||||||
if (error != 0)
|
if (error != 0)
|
||||||
printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error);
|
printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error);
|
||||||
|
|
||||||
mutex_destroy(&zfsdev_state_lock);
|
mutex_destroy(&zfsdev_state_lock);
|
||||||
list_destroy(&zfsdev_state_list);
|
|
||||||
|
for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
|
||||||
|
if (zsprev)
|
||||||
|
kmem_free(zsprev, sizeof (zfsdev_state_t));
|
||||||
|
zsprev = zs;
|
||||||
|
}
|
||||||
|
if (zsprev)
|
||||||
|
kmem_free(zsprev, sizeof (zfsdev_state_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
Loading…
Reference in New Issue