deadlock between mm_sem and tx assign in zfs_write() and page fault
The bug time sequence: 1. thread #1, `zfs_write` assign a txg "n". 2. In a same process, thread #2, mmap page fault (which means the `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed, and wait previous txg "n" completed. 3. thread #1 call `uiomove` to write, however page fault is occurred in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by thread #2, so it stuck and can't complete, then txg "n" will not complete. So thread #1 and thread #2 are deadlocked. Reviewed-by: Chunwei Chen <tuxoko@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Grady Wong <grady.w@xtaotech.com> Closes #7939
This commit is contained in:
parent
44f463824b
commit
98bb45e27a
|
@ -42,7 +42,7 @@
|
||||||
#include <sys/uio.h>
|
#include <sys/uio.h>
|
||||||
|
|
||||||
extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
|
extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
|
||||||
extern void uio_prefaultpages(ssize_t, uio_t *);
|
extern int uio_prefaultpages(ssize_t, uio_t *);
|
||||||
extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
|
extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
|
||||||
extern void uioskip(uio_t *, size_t);
|
extern void uioskip(uio_t *, size_t);
|
||||||
|
|
||||||
|
|
|
@ -50,6 +50,7 @@
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/uio_impl.h>
|
#include <sys/uio_impl.h>
|
||||||
#include <linux/kmap_compat.h>
|
#include <linux/kmap_compat.h>
|
||||||
|
#include <linux/uaccess.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Move "n" bytes at byte address "p"; "rw" indicates the direction
|
* Move "n" bytes at byte address "p"; "rw" indicates the direction
|
||||||
|
@ -77,8 +78,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
|
||||||
if (copy_to_user(iov->iov_base+skip, p, cnt))
|
if (copy_to_user(iov->iov_base+skip, p, cnt))
|
||||||
return (EFAULT);
|
return (EFAULT);
|
||||||
} else {
|
} else {
|
||||||
if (copy_from_user(p, iov->iov_base+skip, cnt))
|
if (uio->uio_fault_disable) {
|
||||||
return (EFAULT);
|
if (!access_ok(VERIFY_READ,
|
||||||
|
(iov->iov_base + skip), cnt)) {
|
||||||
|
return (EFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
pagefault_disable();
|
||||||
|
if (__copy_from_user_inatomic(p,
|
||||||
|
(iov->iov_base + skip), cnt)) {
|
||||||
|
pagefault_enable();
|
||||||
|
return (EFAULT);
|
||||||
|
}
|
||||||
|
pagefault_enable();
|
||||||
|
} else {
|
||||||
|
if (copy_from_user(p,
|
||||||
|
(iov->iov_base + skip), cnt))
|
||||||
|
return (EFAULT);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case UIO_SYSSPACE:
|
case UIO_SYSSPACE:
|
||||||
|
@ -156,7 +173,7 @@ EXPORT_SYMBOL(uiomove);
|
||||||
* error will terminate the process as this is only a best attempt to get
|
* error will terminate the process as this is only a best attempt to get
|
||||||
* the pages resident.
|
* the pages resident.
|
||||||
*/
|
*/
|
||||||
void
|
int
|
||||||
uio_prefaultpages(ssize_t n, struct uio *uio)
|
uio_prefaultpages(ssize_t n, struct uio *uio)
|
||||||
{
|
{
|
||||||
const struct iovec *iov;
|
const struct iovec *iov;
|
||||||
|
@ -170,7 +187,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
|
||||||
switch (uio->uio_segflg) {
|
switch (uio->uio_segflg) {
|
||||||
case UIO_SYSSPACE:
|
case UIO_SYSSPACE:
|
||||||
case UIO_BVEC:
|
case UIO_BVEC:
|
||||||
return;
|
return (0);
|
||||||
case UIO_USERSPACE:
|
case UIO_USERSPACE:
|
||||||
case UIO_USERISPACE:
|
case UIO_USERISPACE:
|
||||||
break;
|
break;
|
||||||
|
@ -194,7 +211,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
|
||||||
p = iov->iov_base + skip;
|
p = iov->iov_base + skip;
|
||||||
while (cnt) {
|
while (cnt) {
|
||||||
if (fuword8((uint8_t *)p, &tmp))
|
if (fuword8((uint8_t *)p, &tmp))
|
||||||
return;
|
return (EFAULT);
|
||||||
incr = MIN(cnt, PAGESIZE);
|
incr = MIN(cnt, PAGESIZE);
|
||||||
p += incr;
|
p += incr;
|
||||||
cnt -= incr;
|
cnt -= incr;
|
||||||
|
@ -204,8 +221,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
|
||||||
*/
|
*/
|
||||||
p--;
|
p--;
|
||||||
if (fuword8((uint8_t *)p, &tmp))
|
if (fuword8((uint8_t *)p, &tmp))
|
||||||
return;
|
return (EFAULT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return (0);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(uio_prefaultpages);
|
EXPORT_SYMBOL(uio_prefaultpages);
|
||||||
|
|
||||||
|
|
|
@ -675,7 +675,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
||||||
xuio = (xuio_t *)uio;
|
xuio = (xuio_t *)uio;
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
uio_prefaultpages(MIN(n, max_blksz), uio);
|
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
||||||
|
ZFS_EXIT(zfsvfs);
|
||||||
|
return (SET_ERROR(EFAULT));
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If in append mode, set the io offset pointer to eof.
|
* If in append mode, set the io offset pointer to eof.
|
||||||
|
@ -820,8 +823,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
||||||
|
|
||||||
if (abuf == NULL) {
|
if (abuf == NULL) {
|
||||||
tx_bytes = uio->uio_resid;
|
tx_bytes = uio->uio_resid;
|
||||||
|
uio->uio_fault_disable = B_TRUE;
|
||||||
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||||
uio, nbytes, tx);
|
uio, nbytes, tx);
|
||||||
|
if (error == EFAULT) {
|
||||||
|
dmu_tx_commit(tx);
|
||||||
|
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if (error != 0) {
|
||||||
|
dmu_tx_commit(tx);
|
||||||
|
break;
|
||||||
|
}
|
||||||
tx_bytes -= uio->uio_resid;
|
tx_bytes -= uio->uio_resid;
|
||||||
} else {
|
} else {
|
||||||
tx_bytes = nbytes;
|
tx_bytes = nbytes;
|
||||||
|
@ -921,8 +935,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
||||||
ASSERT(tx_bytes == nbytes);
|
ASSERT(tx_bytes == nbytes);
|
||||||
n -= nbytes;
|
n -= nbytes;
|
||||||
|
|
||||||
if (!xuio && n > 0)
|
if (!xuio && n > 0) {
|
||||||
uio_prefaultpages(MIN(n, max_blksz), uio);
|
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
||||||
|
error = EFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
zfs_inode_update(zp);
|
zfs_inode_update(zp);
|
||||||
|
|
|
@ -31,74 +31,132 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <err.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* --------------------------------------------------------------------
|
* --------------------------------------------------------------------
|
||||||
* Bug Id: 5032643
|
* Bug Issue Id: #7512
|
||||||
|
* The bug time sequence:
|
||||||
|
* 1. context #1, zfs_write assign a txg "n".
|
||||||
|
* 2. In the same process, context #2, mmap page fault (which means the mm_sem
|
||||||
|
* is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
|
||||||
|
* txg "n" completed.
|
||||||
|
* 3. context #1 call uiomove to write, however page fault is occurred in
|
||||||
|
* uiomove, which means it need mm_sem, but mm_sem is hold by
|
||||||
|
* context #2, so it stuck and can't complete, then txg "n" will not
|
||||||
|
* complete.
|
||||||
*
|
*
|
||||||
* Simply writing to a file and mmaping that file at the same time can
|
* So context #1 and context #2 trap into the "dead lock".
|
||||||
* result in deadlock. Nothing perverse like writing from the file's
|
|
||||||
* own mapping is required.
|
|
||||||
* --------------------------------------------------------------------
|
* --------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static void *
|
#define NORMAL_WRITE_TH_NUM 2
|
||||||
mapper(void *fdp)
|
|
||||||
{
|
|
||||||
void *addr;
|
|
||||||
int fd = *(int *)fdp;
|
|
||||||
|
|
||||||
if ((addr =
|
static void *
|
||||||
mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
|
normal_writer(void *filename)
|
||||||
perror("mmap");
|
{
|
||||||
exit(1);
|
char *file_path = filename;
|
||||||
|
int fd = -1;
|
||||||
|
ssize_t write_num = 0;
|
||||||
|
int page_size = getpagesize();
|
||||||
|
|
||||||
|
fd = open(file_path, O_RDWR | O_CREAT, 0777);
|
||||||
|
if (fd == -1) {
|
||||||
|
err(1, "failed to open %s", file_path);
|
||||||
}
|
}
|
||||||
for (;;) {
|
|
||||||
if (mmap(addr, 8192, PROT_READ,
|
char *buf = malloc(1);
|
||||||
MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
|
while (1) {
|
||||||
perror("mmap");
|
write_num = write(fd, buf, 1);
|
||||||
exit(1);
|
if (write_num == 0) {
|
||||||
|
err(1, "write failed!");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
lseek(fd, page_size, SEEK_CUR);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (buf) {
|
||||||
|
free(buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void *
|
||||||
|
map_writer(void *filename)
|
||||||
|
{
|
||||||
|
int fd = -1;
|
||||||
|
int ret = 0;
|
||||||
|
char *buf = NULL;
|
||||||
|
int page_size = getpagesize();
|
||||||
|
int op_errno = 0;
|
||||||
|
char *file_path = filename;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
ret = access(file_path, F_OK);
|
||||||
|
if (ret) {
|
||||||
|
op_errno = errno;
|
||||||
|
if (op_errno == ENOENT) {
|
||||||
|
fd = open(file_path, O_RDWR | O_CREAT, 0777);
|
||||||
|
if (fd == -1) {
|
||||||
|
err(1, "open file failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = ftruncate(fd, page_size);
|
||||||
|
if (ret == -1) {
|
||||||
|
err(1, "truncate file failed");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
err(1, "access file failed!");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fd = open(file_path, O_RDWR, 0777);
|
||||||
|
if (fd == -1) {
|
||||||
|
err(1, "open file failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
|
||||||
|
MAP_SHARED, fd, 0)) == MAP_FAILED) {
|
||||||
|
err(1, "map file failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fd != -1)
|
||||||
|
close(fd);
|
||||||
|
|
||||||
|
char s[10] = {0, };
|
||||||
|
memcpy(buf, s, 10);
|
||||||
|
ret = munmap(buf, page_size);
|
||||||
|
if (ret != 0) {
|
||||||
|
err(1, "unmap file failed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* NOTREACHED */
|
|
||||||
return ((void *)1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char **argv)
|
main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
int fd;
|
pthread_t map_write_tid;
|
||||||
char buf[1024];
|
pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
|
||||||
pthread_t tid;
|
int i = 0;
|
||||||
|
|
||||||
memset(buf, 'a', sizeof (buf));
|
if (argc != 3) {
|
||||||
|
(void) printf("usage: %s <normal write file name>"
|
||||||
if (argc != 2) {
|
"<map write file name>\n", argv[0]);
|
||||||
(void) printf("usage: %s <file name>\n", argv[0]);
|
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
|
for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
|
||||||
perror("open");
|
if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
|
||||||
exit(1);
|
argv[1])) {
|
||||||
}
|
err(1, "pthread_create normal_writer failed.");
|
||||||
|
|
||||||
(void) pthread_setconcurrency(2);
|
|
||||||
if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
|
|
||||||
perror("pthread_create");
|
|
||||||
close(fd);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
for (;;) {
|
|
||||||
if (write(fd, buf, sizeof (buf)) == -1) {
|
|
||||||
perror("write");
|
|
||||||
close(fd);
|
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
close(fd);
|
if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
|
||||||
|
err(1, "pthread_create map_writer failed.");
|
||||||
|
}
|
||||||
|
|
||||||
/* NOTREACHED */
|
/* NOTREACHED */
|
||||||
|
pthread_join(map_write_tid, NULL);
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,12 +53,14 @@ if ! is_mp; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_must chmod 777 $TESTDIR
|
log_must chmod 777 $TESTDIR
|
||||||
mmapwrite $TESTDIR/test-write-file &
|
mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
|
||||||
PID_MMAPWRITE=$!
|
PID_MMAPWRITE=$!
|
||||||
log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
|
log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
|
||||||
|
"pid: $PID_MMAPWRITE"
|
||||||
log_must sleep 30
|
log_must sleep 30
|
||||||
|
|
||||||
log_must kill -9 $PID_MMAPWRITE
|
log_must kill -9 $PID_MMAPWRITE
|
||||||
log_must ls -l $TESTDIR/test-write-file
|
log_must ls -l $TESTDIR/normal_write_file
|
||||||
|
log_must ls -l $TESTDIR/map_write_file
|
||||||
|
|
||||||
log_pass "write(2) a mmap(2)'ing file succeeded."
|
log_pass "write(2) a mmap(2)'ing file succeeded."
|
||||||
|
|
Loading…
Reference in New Issue