deadlock between mm_sem and tx assign in zfs_write() and page fault
The bug time sequence: 1. thread #1, `zfs_write` assign a txg "n". 2. In a same process, thread #2, mmap page fault (which means the `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed, and wait previous txg "n" completed. 3. thread #1 call `uiomove` to write, however page fault is occurred in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by thread #2, so it stuck and can't complete, then txg "n" will not complete. So thread #1 and thread #2 are deadlocked. Reviewed-by: Chunwei Chen <tuxoko@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Grady Wong <grady.w@xtaotech.com> Closes #7939
This commit is contained in:
parent
b2030e5d51
commit
779a6c0bf6
|
@ -53,6 +53,7 @@ typedef struct uio {
|
|||
int uio_iovcnt;
|
||||
offset_t uio_loffset;
|
||||
uio_seg_t uio_segflg;
|
||||
boolean_t uio_fault_disable;
|
||||
uint16_t uio_fmode;
|
||||
uint16_t uio_extflg;
|
||||
offset_t uio_limit;
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
#include <sys/uio.h>
|
||||
|
||||
extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
|
||||
extern void uio_prefaultpages(ssize_t, uio_t *);
|
||||
extern int uio_prefaultpages(ssize_t, uio_t *);
|
||||
extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
|
||||
extern void uioskip(uio_t *, size_t);
|
||||
|
||||
|
|
|
@ -52,6 +52,7 @@
|
|||
#include <sys/sysmacros.h>
|
||||
#include <sys/strings.h>
|
||||
#include <linux/kmap_compat.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
/*
|
||||
* Move "n" bytes at byte address "p"; "rw" indicates the direction
|
||||
|
@ -79,8 +80,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
|
|||
if (copy_to_user(iov->iov_base+skip, p, cnt))
|
||||
return (EFAULT);
|
||||
} else {
|
||||
if (copy_from_user(p, iov->iov_base+skip, cnt))
|
||||
return (EFAULT);
|
||||
if (uio->uio_fault_disable) {
|
||||
if (!access_ok(VERIFY_READ,
|
||||
(iov->iov_base + skip), cnt)) {
|
||||
return (EFAULT);
|
||||
}
|
||||
|
||||
pagefault_disable();
|
||||
if (__copy_from_user_inatomic(p,
|
||||
(iov->iov_base + skip), cnt)) {
|
||||
pagefault_enable();
|
||||
return (EFAULT);
|
||||
}
|
||||
pagefault_enable();
|
||||
} else {
|
||||
if (copy_from_user(p,
|
||||
(iov->iov_base + skip), cnt))
|
||||
return (EFAULT);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UIO_SYSSPACE:
|
||||
|
@ -158,7 +175,7 @@ EXPORT_SYMBOL(uiomove);
|
|||
* error will terminate the process as this is only a best attempt to get
|
||||
* the pages resident.
|
||||
*/
|
||||
void
|
||||
int
|
||||
uio_prefaultpages(ssize_t n, struct uio *uio)
|
||||
{
|
||||
const struct iovec *iov;
|
||||
|
@ -172,7 +189,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
|
|||
switch (uio->uio_segflg) {
|
||||
case UIO_SYSSPACE:
|
||||
case UIO_BVEC:
|
||||
return;
|
||||
return (0);
|
||||
case UIO_USERSPACE:
|
||||
case UIO_USERISPACE:
|
||||
break;
|
||||
|
@ -196,7 +213,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
|
|||
p = iov->iov_base + skip;
|
||||
while (cnt) {
|
||||
if (fuword8((uint8_t *)p, &tmp))
|
||||
return;
|
||||
return (EFAULT);
|
||||
incr = MIN(cnt, PAGESIZE);
|
||||
p += incr;
|
||||
cnt -= incr;
|
||||
|
@ -206,8 +223,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
|
|||
*/
|
||||
p--;
|
||||
if (fuword8((uint8_t *)p, &tmp))
|
||||
return;
|
||||
return (EFAULT);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
EXPORT_SYMBOL(uio_prefaultpages);
|
||||
|
||||
|
|
|
@ -650,7 +650,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
|||
xuio = (xuio_t *)uio;
|
||||
else
|
||||
#endif
|
||||
uio_prefaultpages(MIN(n, max_blksz), uio);
|
||||
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
||||
ZFS_EXIT(zfsvfs);
|
||||
return (SET_ERROR(EFAULT));
|
||||
}
|
||||
|
||||
/*
|
||||
* If in append mode, set the io offset pointer to eof.
|
||||
|
@ -808,8 +811,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
|||
ssize_t tx_bytes;
|
||||
if (abuf == NULL) {
|
||||
tx_bytes = uio->uio_resid;
|
||||
uio->uio_fault_disable = B_TRUE;
|
||||
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
uio, nbytes, tx);
|
||||
if (error == EFAULT) {
|
||||
dmu_tx_commit(tx);
|
||||
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
} else if (error != 0) {
|
||||
dmu_tx_commit(tx);
|
||||
break;
|
||||
}
|
||||
tx_bytes -= uio->uio_resid;
|
||||
} else {
|
||||
tx_bytes = nbytes;
|
||||
|
@ -909,8 +923,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
|||
ASSERT(tx_bytes == nbytes);
|
||||
n -= nbytes;
|
||||
|
||||
if (!xuio && n > 0)
|
||||
uio_prefaultpages(MIN(n, max_blksz), uio);
|
||||
if (!xuio && n > 0) {
|
||||
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
||||
error = EFAULT;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
zfs_inode_update(zp);
|
||||
|
|
|
@ -31,74 +31,132 @@
|
|||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <pthread.h>
|
||||
#include <errno.h>
|
||||
#include <err.h>
|
||||
|
||||
/*
|
||||
* --------------------------------------------------------------------
|
||||
* Bug Id: 5032643
|
||||
* Bug Issue Id: #7512
|
||||
* The bug time sequence:
|
||||
* 1. context #1, zfs_write assign a txg "n".
|
||||
* 2. In the same process, context #2, mmap page fault (which means the mm_sem
|
||||
* is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
|
||||
* txg "n" completed.
|
||||
* 3. context #1 call uiomove to write, however page fault is occurred in
|
||||
* uiomove, which means it need mm_sem, but mm_sem is hold by
|
||||
* context #2, so it stuck and can't complete, then txg "n" will not
|
||||
* complete.
|
||||
*
|
||||
* Simply writing to a file and mmaping that file at the same time can
|
||||
* result in deadlock. Nothing perverse like writing from the file's
|
||||
* own mapping is required.
|
||||
* So context #1 and context #2 trap into the "dead lock".
|
||||
* --------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
static void *
|
||||
mapper(void *fdp)
|
||||
{
|
||||
void *addr;
|
||||
int fd = *(int *)fdp;
|
||||
#define NORMAL_WRITE_TH_NUM 2
|
||||
|
||||
if ((addr =
|
||||
mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
exit(1);
|
||||
static void *
|
||||
normal_writer(void *filename)
|
||||
{
|
||||
char *file_path = filename;
|
||||
int fd = -1;
|
||||
ssize_t write_num = 0;
|
||||
int page_size = getpagesize();
|
||||
|
||||
fd = open(file_path, O_RDWR | O_CREAT, 0777);
|
||||
if (fd == -1) {
|
||||
err(1, "failed to open %s", file_path);
|
||||
}
|
||||
for (;;) {
|
||||
if (mmap(addr, 8192, PROT_READ,
|
||||
MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
exit(1);
|
||||
|
||||
char *buf = malloc(1);
|
||||
while (1) {
|
||||
write_num = write(fd, buf, 1);
|
||||
if (write_num == 0) {
|
||||
err(1, "write failed!");
|
||||
break;
|
||||
}
|
||||
lseek(fd, page_size, SEEK_CUR);
|
||||
}
|
||||
|
||||
if (buf) {
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
|
||||
static void *
|
||||
map_writer(void *filename)
|
||||
{
|
||||
int fd = -1;
|
||||
int ret = 0;
|
||||
char *buf = NULL;
|
||||
int page_size = getpagesize();
|
||||
int op_errno = 0;
|
||||
char *file_path = filename;
|
||||
|
||||
while (1) {
|
||||
ret = access(file_path, F_OK);
|
||||
if (ret) {
|
||||
op_errno = errno;
|
||||
if (op_errno == ENOENT) {
|
||||
fd = open(file_path, O_RDWR | O_CREAT, 0777);
|
||||
if (fd == -1) {
|
||||
err(1, "open file failed");
|
||||
}
|
||||
|
||||
ret = ftruncate(fd, page_size);
|
||||
if (ret == -1) {
|
||||
err(1, "truncate file failed");
|
||||
}
|
||||
} else {
|
||||
err(1, "access file failed!");
|
||||
}
|
||||
} else {
|
||||
fd = open(file_path, O_RDWR, 0777);
|
||||
if (fd == -1) {
|
||||
err(1, "open file failed");
|
||||
}
|
||||
}
|
||||
|
||||
if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED, fd, 0)) == MAP_FAILED) {
|
||||
err(1, "map file failed");
|
||||
}
|
||||
|
||||
if (fd != -1)
|
||||
close(fd);
|
||||
|
||||
char s[10] = {0, };
|
||||
memcpy(buf, s, 10);
|
||||
ret = munmap(buf, page_size);
|
||||
if (ret != 0) {
|
||||
err(1, "unmap file failed");
|
||||
}
|
||||
}
|
||||
/* NOTREACHED */
|
||||
return ((void *)1);
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int fd;
|
||||
char buf[1024];
|
||||
pthread_t tid;
|
||||
pthread_t map_write_tid;
|
||||
pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
|
||||
int i = 0;
|
||||
|
||||
memset(buf, 'a', sizeof (buf));
|
||||
|
||||
if (argc != 2) {
|
||||
(void) printf("usage: %s <file name>\n", argv[0]);
|
||||
if (argc != 3) {
|
||||
(void) printf("usage: %s <normal write file name>"
|
||||
"<map write file name>\n", argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
|
||||
perror("open");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
(void) pthread_setconcurrency(2);
|
||||
if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
|
||||
perror("pthread_create");
|
||||
close(fd);
|
||||
exit(1);
|
||||
}
|
||||
for (;;) {
|
||||
if (write(fd, buf, sizeof (buf)) == -1) {
|
||||
perror("write");
|
||||
close(fd);
|
||||
exit(1);
|
||||
for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
|
||||
if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
|
||||
argv[1])) {
|
||||
err(1, "pthread_create normal_writer failed.");
|
||||
}
|
||||
}
|
||||
|
||||
close(fd);
|
||||
if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
|
||||
err(1, "pthread_create map_writer failed.");
|
||||
}
|
||||
|
||||
/* NOTREACHED */
|
||||
pthread_join(map_write_tid, NULL);
|
||||
return (0);
|
||||
}
|
||||
|
|
|
@ -53,12 +53,14 @@ if ! is_mp; then
|
|||
fi
|
||||
|
||||
log_must chmod 777 $TESTDIR
|
||||
mmapwrite $TESTDIR/test-write-file &
|
||||
mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
|
||||
PID_MMAPWRITE=$!
|
||||
log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
|
||||
log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
|
||||
"pid: $PID_MMAPWRITE"
|
||||
log_must sleep 30
|
||||
|
||||
log_must kill -9 $PID_MMAPWRITE
|
||||
log_must ls -l $TESTDIR/test-write-file
|
||||
log_must ls -l $TESTDIR/normal_write_file
|
||||
log_must ls -l $TESTDIR/map_write_file
|
||||
|
||||
log_pass "write(2) a mmap(2)'ing file succeeded."
|
||||
|
|
Loading…
Reference in New Issue