From f763c3d1df569a8d6b60bcb5e95cf07aa7a189e6 Mon Sep 17 00:00:00 2001 From: LOLi Date: Mon, 21 Aug 2017 17:59:48 +0200 Subject: [PATCH] Fix range locking in ZIL commit codepath Since OpenZFS 7578 (1b7c1e5) if we have a ZVOL with logbias=throughput we will force WR_INDIRECT itxs in zvol_log_write() setting itx->itx_lr offset and length to the offset and length of the BIO from zvol_write()->zvol_log_write(): these offset and length are later used to take a range lock in zillog->zl_get_data function: zvol_get_data(). Now suppose we have a ZVOL with blocksize=8K and push 4K writes to offset 0: we will only be range-locking 0-4096. This means the ASSERTion we make in dbuf_unoverride() is no longer valid because now dmu_sync() is called from zilog's get_data functions holding a partial lock on the dbuf. Fix this by taking a range lock on the whole block in zvol_get_data(). Reviewed-by: Chunwei Chen Reviewed-by: Brian Behlendorf Signed-off-by: loli10K Closes #6238 Closes #6315 Closes #6356 Closes #6477 --- cmd/ztest/ztest.c | 2 + module/zfs/dmu.c | 6 ++ module/zfs/zfs_vnops.c | 2 +- module/zfs/zvol.c | 48 +++++++++--- tests/runfiles/linux.run | 2 +- .../functional/zvol/zvol_misc/Makefile.am | 3 +- .../zvol/zvol_misc/zvol_misc_zil.ksh | 74 +++++++++++++++++++ 7 files changed, 123 insertions(+), 14 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 3acfaecaff..8ea263dea9 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -2145,6 +2145,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) if (buf != NULL) { /* immediate write */ zgd_private->z_rl = ztest_range_lock(zd, object, offset, size, RL_READER); + zgd->zgd_rl = zgd_private->z_rl->z_rl; error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -2160,6 +2161,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd_private->z_rl = ztest_range_lock(zd, object, offset, size, RL_READER); + zgd->zgd_rl = zgd_private->z_rl->z_rl; error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index e098a4966b..5b90818f4e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -49,6 +49,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -1815,6 +1816,11 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) ASSERT(pio != NULL); ASSERT(txg != 0); + /* dbuf is within the locked range */ + ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off); + ASSERT3U(db->db.db_offset + db->db.db_size, <=, + zgd->zgd_rl->r_off + zgd->zgd_rl->r_len); + SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 53c5e4f238..ffeac7730e 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1050,7 +1050,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's - * written out and it's checksum is being calculated + * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 4d11b52ab3..60fab5cc68 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -821,6 +821,7 @@ zvol_discard(void *arg) uint64_t start = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); uint64_t end = start + size; + boolean_t sync; int error = 0; dmu_tx_t *tx; unsigned long start_jif; @@ -830,9 +831,11 @@ zvol_discard(void *arg) start_jif = jiffies; generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0); + sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + if (end > zv->zv_volsize) { error = SET_ERROR(EIO); - goto out; + goto unlock; } /* @@ -848,7 +851,7 @@ zvol_discard(void *arg) } if (start >= end) - goto out; + goto unlock; tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); @@ -861,9 +864,11 @@ zvol_discard(void *arg) error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } - -out: +unlock: zfs_range_unlock(zvr->rl); + if (error == 0 && sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif); BIO_END_IO(bio, -error); @@ -933,6 +938,8 @@ zvol_request(struct request_queue *q, struct bio *bio) } if (rw == WRITE) { + boolean_t need_sync = B_FALSE; + if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { BIO_END_IO(bio, -SET_ERROR(EROFS)); goto out; @@ -966,13 +973,24 @@ zvol_request(struct request_queue *q, struct bio *bio) */ zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size, RL_WRITER); + /* + * Sync writes and discards execute zil_commit() which may need + * to take a RL_READER lock on the whole block being modified + * via its zillog->zl_get_data(): to avoid circular dependency + * issues with taskq threads execute these requests + * synchronously here in zvol_request(). + */ + need_sync = bio_is_fua(bio) || + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { - if (zvol_request_sync || taskq_dispatch(zvol_taskq, - zvol_discard, zvr, TQ_SLEEP) == TASKQID_INVALID) + if (zvol_request_sync || need_sync || + taskq_dispatch(zvol_taskq, zvol_discard, zvr, + TQ_SLEEP) == TASKQID_INVALID) zvol_discard(zvr); } else { - if (zvol_request_sync || taskq_dispatch(zvol_taskq, - zvol_write, zvr, TQ_SLEEP) == TASKQID_INVALID) + if (zvol_request_sync || need_sync || + taskq_dispatch(zvol_taskq, zvol_write, zvr, + TQ_SLEEP) == TASKQID_INVALID) zvol_write(zvr); } } else { @@ -1030,8 +1048,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size, - RL_READER); /* * Write records come in two flavors: immediate and indirect. @@ -1041,11 +1057,21 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ + zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size, + RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); - } else { + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's written out + * and its checksum is being calculated that no one can change + * the data. Contrarily to zfs_get_data we need not re-check + * blocksize after we get the lock because it cannot be changed. + */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); + zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size, + RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index cb8e2841f6..0cbbfa03e5 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -594,7 +594,7 @@ tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_001_neg', 'zvol_misc_002_pos', 'zvol_misc_003_neg', 'zvol_misc_004_pos', 'zvol_misc_005_neg', 'zvol_misc_006_pos', - 'zvol_misc_snapdev', 'zvol_misc_volmode'] + 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_003_pos', diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile.am b/tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile.am index 30ef9acbd0..8859ff3598 100644 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile.am +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile.am @@ -10,4 +10,5 @@ dist_pkgdata_SCRIPTS = \ zvol_misc_005_neg.ksh \ zvol_misc_006_pos.ksh \ zvol_misc_snapdev.ksh \ - zvol_misc_volmode.ksh + zvol_misc_volmode.ksh \ + zvol_misc_zil.ksh diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh new file mode 100755 index 0000000000..b8989f4787 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2017, loli10K . All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/zvol/zvol_common.shlib +. $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib + +# +# DESCRIPTION: +# Verify ZIL functionality on ZVOLs +# +# STRATEGY: +# 1. Create a ZVOLs with various combination of "logbias" and "sync" values +# 2. Write data to ZVOL device node +# 3. Verify we don't trigger any issue like the one reported in #6238 +# + +verify_runnable "global" + +function cleanup +{ + datasetexists $ZVOL && log_must_busy zfs destroy $ZVOL + udev_wait +} + +log_assert "Verify ZIL functionality on ZVOLs" +log_onexit cleanup + +ZVOL="$TESTPOOL/vol" +ZDEV="$ZVOL_DEVDIR/$ZVOL" +typeset -a logbias_prop_vals=('latency' 'throughput') +typeset -a sync_prop_vals=('standard' 'always' 'disabled') + +for logbias in ${logbias_prop_vals[@]}; do + for sync in ${sync_prop_vals[@]}; do + # 1. Create a ZVOL with logbias=throughput and sync=always + log_must zfs create -V $VOLSIZE -b 128K -o sync=$sync \ + -o logbias=$logbias $ZVOL + + # 2. Write data to its device node + for i in {1..50}; do + dd if=/dev/zero of=$ZDEV bs=8k count=1 & + done + + # 3. Verify we don't trigger any issue + log_must wait + log_must_busy zfs destroy $ZVOL + done +done + +log_pass "ZIL functionality works on ZVOLs"