Fix synchronous behavior in __vdev_disk_physio()

Commit b39c22b set the READ_SYNC and WRITE_SYNC flags for a bio
based on the ZIO_PRIORITY_* flag passed in.  This had the unnoticed
side-effect of making vdev_disk_io_start() synchronous for
certain I/Os.

This in turn resulted in vdev_disk_io_start() being able to
re-dispatch zios, which would result in RCU stalls when a disk
was removed from the system.  Additionally, this could negatively
impact performance and explains the performance regressions reported
in both #3829 and #3780.

This patch resolves the issue by making the blocking behavior
dependent on a 'wait' flag being passed rather than overloading
the passed bio flags.

Finally, the WRITE_SYNC and READ_SYNC behavior is restricted to
non-rotational devices where there is no benefit to queuing to
aggregate the I/O.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #3652
Issue #3780
Issue #3785
Issue #3817
Issue #3821
Issue #3829
Issue #3832
Issue #3870
This commit is contained in:
Brian Behlendorf 2015-09-24 16:32:25 -07:00
parent ef5b2e1048
commit 5592404784
3 changed files with 8 additions and 81 deletions

View File

@ -1,50 +0,0 @@
dnl #
dnl # Preferred interface for flagging a synchronous bio:
dnl # 2.6.12-2.6.29: BIO_RW_SYNC
dnl # 2.6.30-2.6.35: BIO_RW_SYNCIO
dnl # 2.6.36-2.6.xx: REQ_SYNC
dnl #
dnl #
dnl # ZFS_AC_KERNEL_BIO_RW_SYNC
dnl #
dnl # Reports "whether BIO_RW_SYNC is defined" by handing
dnl # ZFS_LINUX_TRY_COMPILE a snippet that includes <linux/bio.h> and
dnl # assigns BIO_RW_SYNC to a local int.  The first result branch prints
dnl # "yes" and defines HAVE_BIO_RW_SYNC to 1; the other branch only
dnl # prints "no".
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_SYNC], [
AC_MSG_CHECKING([whether BIO_RW_SYNC is defined])
ZFS_LINUX_TRY_COMPILE([
#include <linux/bio.h>
],[
int flags __attribute__ ((unused));
flags = BIO_RW_SYNC;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BIO_RW_SYNC, 1, [BIO_RW_SYNC is defined])
],[
AC_MSG_RESULT(no)
])
])
dnl #
dnl # ZFS_AC_KERNEL_BIO_RW_SYNCIO
dnl #
dnl # Reports "whether BIO_RW_SYNCIO is defined" by handing
dnl # ZFS_LINUX_TRY_COMPILE a snippet that includes <linux/bio.h> and
dnl # assigns BIO_RW_SYNCIO to a local int.  The first result branch
dnl # prints "yes" and defines HAVE_BIO_RW_SYNCIO to 1; the other branch
dnl # only prints "no".
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_SYNCIO], [
AC_MSG_CHECKING([whether BIO_RW_SYNCIO is defined])
ZFS_LINUX_TRY_COMPILE([
#include <linux/bio.h>
],[
int flags __attribute__ ((unused));
flags = BIO_RW_SYNCIO;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BIO_RW_SYNCIO, 1, [BIO_RW_SYNCIO is defined])
],[
AC_MSG_RESULT(no)
])
])
dnl #
dnl # ZFS_AC_KERNEL_REQ_SYNC
dnl #
dnl # Reports "whether REQ_SYNC is defined" by handing
dnl # ZFS_LINUX_TRY_COMPILE a snippet that includes <linux/bio.h> and
dnl # assigns REQ_SYNC to a local int.  The first result branch prints
dnl # "yes" and defines HAVE_REQ_SYNC to 1; the other branch only prints
dnl # "no".
dnl #
AC_DEFUN([ZFS_AC_KERNEL_REQ_SYNC], [
AC_MSG_CHECKING([whether REQ_SYNC is defined])
ZFS_LINUX_TRY_COMPILE([
#include <linux/bio.h>
],[
int flags __attribute__ ((unused));
flags = REQ_SYNC;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_REQ_SYNC, 1, [REQ_SYNC is defined])
],[
AC_MSG_RESULT(no)
])
])

View File

@ -25,9 +25,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_BIO_END_IO_T_ARGS ZFS_AC_KERNEL_BIO_END_IO_T_ARGS
ZFS_AC_KERNEL_BIO_RW_BARRIER ZFS_AC_KERNEL_BIO_RW_BARRIER
ZFS_AC_KERNEL_BIO_RW_DISCARD ZFS_AC_KERNEL_BIO_RW_DISCARD
ZFS_AC_KERNEL_BIO_RW_SYNC
ZFS_AC_KERNEL_BIO_RW_SYNCIO
ZFS_AC_KERNEL_REQ_SYNC
ZFS_AC_KERNEL_BLK_QUEUE_FLUSH ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS

View File

@ -369,27 +369,6 @@ vdev_disk_dio_free(dio_request_t *dr)
sizeof (struct bio *) * dr->dr_bio_count); sizeof (struct bio *) * dr->dr_bio_count);
} }
/*
 * Test whether the request's dr_rw flags carry the kernel's synchronous
 * bio flag.  Exactly one branch is compiled, chosen from the
 * configure-time HAVE_* macros in priority order: BIO_RW_SYNC, then
 * BIO_RW_SYNCIO, then REQ_SYNC.  The build fails if none is available.
 *
 * Returns the masked flag bits: non-zero when the sync flag is set in
 * dr->dr_rw, zero otherwise.
 */
static int
vdev_disk_dio_is_sync(dio_request_t *dr)
{
#if defined(HAVE_BIO_RW_SYNC)
	/* 2.6.12-2.6.29: BIO_RW_SYNC is used as a bit position */
	return (dr->dr_rw & (1 << BIO_RW_SYNC));
#elif defined(HAVE_BIO_RW_SYNCIO)
	/* 2.6.30-2.6.35: BIO_RW_SYNCIO is used as a bit position */
	return (dr->dr_rw & (1 << BIO_RW_SYNCIO));
#elif defined(HAVE_REQ_SYNC)
	/* 2.6.36 and later: REQ_SYNC is applied directly as a mask */
	return (dr->dr_rw & REQ_SYNC);
#else
#error "Unable to determine bio sync flag"
#endif
}
static void static void
vdev_disk_dio_get(dio_request_t *dr) vdev_disk_dio_get(dio_request_t *dr)
{ {
@ -444,7 +423,7 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
rc = vdev_disk_dio_put(dr); rc = vdev_disk_dio_put(dr);
/* Wake up synchronous waiter this is the last outstanding bio */ /* Wake up synchronous waiter this is the last outstanding bio */
if ((rc == 1) && vdev_disk_dio_is_sync(dr)) if (rc == 1)
complete(&dr->dr_comp); complete(&dr->dr_comp);
} }
@ -512,7 +491,7 @@ vdev_submit_bio(int rw, struct bio *bio)
static int static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
size_t kbuf_size, uint64_t kbuf_offset, int flags) size_t kbuf_size, uint64_t kbuf_offset, int flags, int wait)
{ {
dio_request_t *dr; dio_request_t *dr;
caddr_t bio_ptr; caddr_t bio_ptr;
@ -603,7 +582,7 @@ retry:
* only synchronous consumer is vdev_disk_read_rootlabel() all other * only synchronous consumer is vdev_disk_read_rootlabel() all other
* IO originating from vdev_disk_io_start() is asynchronous. * IO originating from vdev_disk_io_start() is asynchronous.
*/ */
if (vdev_disk_dio_is_sync(dr)) { if (wait) {
wait_for_completion(&dr->dr_comp); wait_for_completion(&dr->dr_comp);
error = dr->dr_error; error = dr->dr_error;
ASSERT3S(atomic_read(&dr->dr_ref), ==, 1); ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
@ -619,7 +598,7 @@ vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
size_t size, uint64_t offset, int flags) size_t size, uint64_t offset, int flags)
{ {
bio_set_flags_failfast(bdev, &flags); bio_set_flags_failfast(bdev, &flags);
return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags)); return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags, 1));
} }
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc) BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
@ -671,6 +650,7 @@ vdev_disk_io_start(zio_t *zio)
{ {
vdev_t *v = zio->io_vd; vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd; vdev_disk_t *vd = v->vdev_tsd;
zio_priority_t pri = zio->io_priority;
int flags, error; int flags, error;
switch (zio->io_type) { switch (zio->io_type) {
@ -710,14 +690,14 @@ vdev_disk_io_start(zio_t *zio)
zio_execute(zio); zio_execute(zio);
return; return;
case ZIO_TYPE_WRITE: case ZIO_TYPE_WRITE:
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) if ((pri == ZIO_PRIORITY_SYNC_WRITE) && (v->vdev_nonrot))
flags = WRITE_SYNC; flags = WRITE_SYNC;
else else
flags = WRITE; flags = WRITE;
break; break;
case ZIO_TYPE_READ: case ZIO_TYPE_READ:
if (zio->io_priority == ZIO_PRIORITY_SYNC_READ) if ((pri == ZIO_PRIORITY_SYNC_READ) && (v->vdev_nonrot))
flags = READ_SYNC; flags = READ_SYNC;
else else
flags = READ; flags = READ;
@ -730,7 +710,7 @@ vdev_disk_io_start(zio_t *zio)
} }
error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
zio->io_size, zio->io_offset, flags); zio->io_size, zio->io_offset, flags, 0);
if (error) { if (error) {
zio->io_error = error; zio->io_error = error;
zio_interrupt(zio); zio_interrupt(zio);