deadlock between mm_sem and tx assign in zfs_write() and page fault

The bug time sequence:
1. thread #1, `zfs_write` assign a txg "n".
2. In a same process, thread #2, mmap page fault (which means the
   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
   and wait previous txg "n" completed.
3. thread #1 call `uiomove` to write, however page fault is occurred
   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
   thread #2, so it stuck and can't complete,  then txg "n" will
   not complete.

So thread #1 and thread #2 are deadlocked.

Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Grady Wong <grady.w@xtaotech.com>
Closes #7939
This commit is contained in:
ilbsmart 2018-10-17 02:11:24 +08:00 committed by Tony Hutter
parent 44f463824b
commit 98bb45e27a
5 changed files with 154 additions and 57 deletions

View File

@ -42,7 +42,7 @@
#include <sys/uio.h> #include <sys/uio.h>
extern int uiomove(void *, size_t, enum uio_rw, uio_t *); extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
extern void uio_prefaultpages(ssize_t, uio_t *); extern int uio_prefaultpages(ssize_t, uio_t *);
extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
extern void uioskip(uio_t *, size_t); extern void uioskip(uio_t *, size_t);

View File

@ -50,6 +50,7 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/uio_impl.h> #include <sys/uio_impl.h>
#include <linux/kmap_compat.h> #include <linux/kmap_compat.h>
#include <linux/uaccess.h>
/* /*
* Move "n" bytes at byte address "p"; "rw" indicates the direction * Move "n" bytes at byte address "p"; "rw" indicates the direction
@ -77,9 +78,25 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
if (copy_to_user(iov->iov_base+skip, p, cnt)) if (copy_to_user(iov->iov_base+skip, p, cnt))
return (EFAULT); return (EFAULT);
} else { } else {
if (copy_from_user(p, iov->iov_base+skip, cnt)) if (uio->uio_fault_disable) {
if (!access_ok(VERIFY_READ,
(iov->iov_base + skip), cnt)) {
return (EFAULT); return (EFAULT);
} }
pagefault_disable();
if (__copy_from_user_inatomic(p,
(iov->iov_base + skip), cnt)) {
pagefault_enable();
return (EFAULT);
}
pagefault_enable();
} else {
if (copy_from_user(p,
(iov->iov_base + skip), cnt))
return (EFAULT);
}
}
break; break;
case UIO_SYSSPACE: case UIO_SYSSPACE:
if (rw == UIO_READ) if (rw == UIO_READ)
@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove);
* error will terminate the process as this is only a best attempt to get * error will terminate the process as this is only a best attempt to get
* the pages resident. * the pages resident.
*/ */
void int
uio_prefaultpages(ssize_t n, struct uio *uio) uio_prefaultpages(ssize_t n, struct uio *uio)
{ {
const struct iovec *iov; const struct iovec *iov;
@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
switch (uio->uio_segflg) { switch (uio->uio_segflg) {
case UIO_SYSSPACE: case UIO_SYSSPACE:
case UIO_BVEC: case UIO_BVEC:
return; return (0);
case UIO_USERSPACE: case UIO_USERSPACE:
case UIO_USERISPACE: case UIO_USERISPACE:
break; break;
@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
p = iov->iov_base + skip; p = iov->iov_base + skip;
while (cnt) { while (cnt) {
if (fuword8((uint8_t *)p, &tmp)) if (fuword8((uint8_t *)p, &tmp))
return; return (EFAULT);
incr = MIN(cnt, PAGESIZE); incr = MIN(cnt, PAGESIZE);
p += incr; p += incr;
cnt -= incr; cnt -= incr;
@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
*/ */
p--; p--;
if (fuword8((uint8_t *)p, &tmp)) if (fuword8((uint8_t *)p, &tmp))
return; return (EFAULT);
} }
return (0);
} }
EXPORT_SYMBOL(uio_prefaultpages); EXPORT_SYMBOL(uio_prefaultpages);

View File

@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
xuio = (xuio_t *)uio; xuio = (xuio_t *)uio;
else else
#endif #endif
uio_prefaultpages(MIN(n, max_blksz), uio); if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EFAULT));
}
/* /*
* If in append mode, set the io offset pointer to eof. * If in append mode, set the io offset pointer to eof.
@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
if (abuf == NULL) { if (abuf == NULL) {
tx_bytes = uio->uio_resid; tx_bytes = uio->uio_resid;
uio->uio_fault_disable = B_TRUE;
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx); uio, nbytes, tx);
if (error == EFAULT) {
dmu_tx_commit(tx);
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
break;
}
continue;
} else if (error != 0) {
dmu_tx_commit(tx);
break;
}
tx_bytes -= uio->uio_resid; tx_bytes -= uio->uio_resid;
} else { } else {
tx_bytes = nbytes; tx_bytes = nbytes;
@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
ASSERT(tx_bytes == nbytes); ASSERT(tx_bytes == nbytes);
n -= nbytes; n -= nbytes;
if (!xuio && n > 0) if (!xuio && n > 0) {
uio_prefaultpages(MIN(n, max_blksz), uio); if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
error = EFAULT;
break;
}
}
} }
zfs_inode_update(zp); zfs_inode_update(zp);

View File

@ -31,74 +31,132 @@
#include <string.h> #include <string.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <pthread.h> #include <pthread.h>
#include <errno.h>
#include <err.h>
/* /*
* -------------------------------------------------------------------- * --------------------------------------------------------------------
* Bug Id: 5032643 * Bug Issue Id: #7512
* The bug time sequence:
* 1. context #1, zfs_write assign a txg "n".
* 2. In the same process, context #2, mmap page fault (which means the mm_sem
* is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
* txg "n" completed.
* 3. context #1 call uiomove to write, however page fault is occurred in
* uiomove, which means it need mm_sem, but mm_sem is hold by
* context #2, so it stuck and can't complete, then txg "n" will not
* complete.
* *
* Simply writing to a file and mmaping that file at the same time can * So context #1 and context #2 trap into the "dead lock".
* result in deadlock. Nothing perverse like writing from the file's
* own mapping is required.
* -------------------------------------------------------------------- * --------------------------------------------------------------------
*/ */
static void * #define NORMAL_WRITE_TH_NUM 2
mapper(void *fdp)
{
void *addr;
int fd = *(int *)fdp;
if ((addr = static void *
mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) { normal_writer(void *filename)
perror("mmap"); {
exit(1); char *file_path = filename;
int fd = -1;
ssize_t write_num = 0;
int page_size = getpagesize();
fd = open(file_path, O_RDWR | O_CREAT, 0777);
if (fd == -1) {
err(1, "failed to open %s", file_path);
} }
for (;;) {
if (mmap(addr, 8192, PROT_READ, char *buf = malloc(1);
MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) { while (1) {
perror("mmap"); write_num = write(fd, buf, 1);
exit(1); if (write_num == 0) {
err(1, "write failed!");
break;
}
lseek(fd, page_size, SEEK_CUR);
}
if (buf) {
free(buf);
}
}
static void *
map_writer(void *filename)
{
int fd = -1;
int ret = 0;
char *buf = NULL;
int page_size = getpagesize();
int op_errno = 0;
char *file_path = filename;
while (1) {
ret = access(file_path, F_OK);
if (ret) {
op_errno = errno;
if (op_errno == ENOENT) {
fd = open(file_path, O_RDWR | O_CREAT, 0777);
if (fd == -1) {
err(1, "open file failed");
}
ret = ftruncate(fd, page_size);
if (ret == -1) {
err(1, "truncate file failed");
}
} else {
err(1, "access file failed!");
}
} else {
fd = open(file_path, O_RDWR, 0777);
if (fd == -1) {
err(1, "open file failed");
}
}
if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0)) == MAP_FAILED) {
err(1, "map file failed");
}
if (fd != -1)
close(fd);
char s[10] = {0, };
memcpy(buf, s, 10);
ret = munmap(buf, page_size);
if (ret != 0) {
err(1, "unmap file failed");
} }
} }
/* NOTREACHED */
return ((void *)1);
} }
int int
main(int argc, char **argv) main(int argc, char **argv)
{ {
int fd; pthread_t map_write_tid;
char buf[1024]; pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
pthread_t tid; int i = 0;
memset(buf, 'a', sizeof (buf)); if (argc != 3) {
(void) printf("usage: %s <normal write file name>"
if (argc != 2) { "<map write file name>\n", argv[0]);
(void) printf("usage: %s <file name>\n", argv[0]);
exit(1); exit(1);
} }
if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) { for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
perror("open"); if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
exit(1); argv[1])) {
} err(1, "pthread_create normal_writer failed.");
(void) pthread_setconcurrency(2);
if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
perror("pthread_create");
close(fd);
exit(1);
}
for (;;) {
if (write(fd, buf, sizeof (buf)) == -1) {
perror("write");
close(fd);
exit(1);
} }
} }
close(fd); if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
err(1, "pthread_create map_writer failed.");
}
/* NOTREACHED */ /* NOTREACHED */
pthread_join(map_write_tid, NULL);
return (0); return (0);
} }

View File

@ -53,12 +53,14 @@ if ! is_mp; then
fi fi
log_must chmod 777 $TESTDIR log_must chmod 777 $TESTDIR
mmapwrite $TESTDIR/test-write-file & mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
PID_MMAPWRITE=$! PID_MMAPWRITE=$!
log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE" log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
"pid: $PID_MMAPWRITE"
log_must sleep 30 log_must sleep 30
log_must kill -9 $PID_MMAPWRITE log_must kill -9 $PID_MMAPWRITE
log_must ls -l $TESTDIR/test-write-file log_must ls -l $TESTDIR/normal_write_file
log_must ls -l $TESTDIR/map_write_file
log_pass "write(2) a mmap(2)'ing file succeeded." log_pass "write(2) a mmap(2)'ing file succeeded."