Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol

4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4390
  https://github.com/illumos/illumos-gate/commit/7fd05ac

Porting notes:

Previous stack-reduction efforts in traverse_visitbp() caused a fair
number of un-mergeable pieces of code.  This patch should reduce its
stack footprint a bit further.

The new local bptree_entry_phys_t in bptree_add() is dynamically
allocated with kmem_zalloc() to reduce stack usage.
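
As a rough illustration of the pattern (not part of the patch itself), the
on-stack entry is replaced by a heap allocation along these lines; the
helper name below is hypothetical and the surrounding bptree_add() context
is omitted, see the bptree.c hunk further down for the real change:

#include <sys/bptree.h>
#include <sys/dmu.h>
#include <sys/kmem.h>

/*
 * Hypothetical helper sketching the stack-reduction pattern used in
 * bptree_add(): allocate the entry with kmem_zalloc() instead of
 * declaring it on the kernel stack, write it out, then free it.
 */
static void
bptree_add_entry_sketch(objset_t *os, uint64_t obj, bptree_phys_t *bt,
    const blkptr_t *bp, uint64_t birth_txg, dmu_tx_t *tx)
{
        bptree_entry_phys_t *bte;

        /* KM_PUSHPAGE avoids recursing into the filesystem on reclaim */
        bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
        bte->be_birth_txg = birth_txg;
        bte->be_bp = *bp;
        /* be_zb is already zeroed by kmem_zalloc(), so no bzero() needed */

        dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
        kmem_free(bte, sizeof (*bte));
}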

The new global zfs_free_leak_on_eio has been defined as an integer
rather than a boolean_t, matching the treatment of the related
zfs_recover global.  Its definition has also been placed in zfs_debug.c
for consistency with the existing definition of zfs_recover, whereas
Illumos placed it in spa_misc.c.
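
For reference, a condensed sketch of how the tunable ends up being declared
and exported; this simply mirrors the zfs_debug.h and zfs_debug.c hunks
further down and the existing zfs_recover convention:

/* declared extern in zfs_debug.h, defined in zfs_debug.c */
int zfs_free_leak_on_eio = B_FALSE;     /* an int, like zfs_recover */

#ifdef _KERNEL
module_param(zfs_free_leak_on_eio, int, 0644);
MODULE_PARM_DESC(zfs_free_leak_on_eio,
        "Set to ignore IO errors during free and permanently leak the space");
#endif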

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2545
Matthew Ahrens 2014-06-05 13:20:08 -08:00 committed by Brian Behlendorf
parent 9b67f60560
commit fbeddd60b7
17 changed files with 339 additions and 157 deletions


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_BPTREE_H #ifndef _SYS_BPTREE_H
@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);


@ -250,7 +250,6 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DMU_USERUSED_OBJECT (-1ULL) #define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL) #define DMU_GROUPUSED_OBJECT (-2ULL)
#define DMU_DEADLIST_OBJECT (-3ULL)
/* /*
* artificial blkids for bonus buffer and spill blocks * artificial blkids for bonus buffer and spill blocks


@ -144,6 +144,7 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
#define ORIGIN_DIR_NAME "$ORIGIN" #define ORIGIN_DIR_NAME "$ORIGIN"
#define XLATION_DIR_NAME "$XLATION" #define XLATION_DIR_NAME "$XLATION"
#define FREE_DIR_NAME "$FREE" #define FREE_DIR_NAME "$FREE"
#define LEAK_DIR_NAME "$LEAK"
#ifdef ZFS_DEBUG #ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \ #define dprintf_dd(dd, fmt, ...) do { \


@ -87,6 +87,7 @@ typedef struct dsl_pool {
struct dsl_dir *dp_root_dir; struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir; struct dsl_dir *dp_mos_dir;
struct dsl_dir *dp_free_dir; struct dsl_dir *dp_free_dir;
struct dsl_dir *dp_leak_dir;
struct dsl_dataset *dp_origin_snap; struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj; uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq; struct taskq *dp_iput_taskq;


@ -116,6 +116,7 @@ typedef struct dsl_scan {
/* for freeing blocks */ /* for freeing blocks */
boolean_t scn_is_bptree; boolean_t scn_is_bptree;
boolean_t scn_async_destroying; boolean_t scn_async_destroying;
boolean_t scn_async_stalled;
/* for debugging / information */ /* for debugging / information */
uint64_t scn_visited_this_txg; uint64_t scn_visited_this_txg;


@ -193,6 +193,7 @@ typedef enum {
ZPOOL_PROP_COMMENT, ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING, ZPOOL_PROP_FREEING,
ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS ZPOOL_NUM_PROPS
} zpool_prop_t; } zpool_prop_t;


@ -48,6 +48,7 @@ extern "C" {
extern int zfs_flags; extern int zfs_flags;
extern int zfs_recover; extern int zfs_recover;
extern int zfs_free_leak_on_eio;
#define ZFS_DEBUG_DPRINTF (1<<0) #define ZFS_DEBUG_DPRINTF (1<<0)
#define ZFS_DEBUG_DBUF_VERIFY (1<<1) #define ZFS_DEBUG_DBUF_VERIFY (1<<1)


@ -316,6 +316,7 @@ zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREE:
case ZPOOL_PROP_FREEING: case ZPOOL_PROP_FREEING:
case ZPOOL_PROP_LEAKED:
case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_EXPANDSZ:
case ZPOOL_PROP_ASHIFT: case ZPOOL_PROP_ASHIFT:
if (literal) if (literal)


@ -696,6 +696,43 @@ Set additional debugging flags
Default value: \fB1\fR. Default value: \fB1\fR.
.RE .RE
.sp
.ne 2
.na
\fBzfs_free_leak_on_eio\fR (int)
.ad
.RS 12n
If destroy encounters an EIO while reading metadata (e.g. indirect
blocks), space referenced by the missing metadata can not be freed.
Normally this causes the background destroy to become "stalled", as
it is unable to make forward progress. While in this stalled state,
all remaining space to free from the error-encountering filesystem is
"temporarily leaked". Set this flag to cause it to ignore the EIO,
permanently leak the space from indirect blocks that can not be read,
and continue to free everything else that it can.
The default, "stalling" behavior is useful if the storage partially
fails (i.e. some but not all i/os fail), and then later recovers. In
this case, we will be able to continue pool operations while it is
partially failed, and when it recovers, we can continue to free the
space, with no leaks. However, note that this case is actually
fairly rare.
Typically pools either (a) fail completely (but perhaps temporarily,
e.g. a top-level vdev going offline), or (b) have localized,
permanent errors (e.g. disk returns the wrong data due to bit flip or
firmware bug). In case (a), this setting does not matter because the
pool will be suspended and the sync thread will not be able to make
forward progress regardless. In case (b), because the error is
permanent, the best we can do is leak the minimum amount of space,
which is what setting this flag will do. Therefore, it is reasonable
for this flag to normally be set, but we chose the more conservative
approach of not setting it, so that there is no possibility of
leaking space in the "partial temporary" failure case.
.sp
Default value: \fB0\fR.
.RE
.sp .sp
.ne 2 .ne 2
.na .na


@ -81,6 +81,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "<size>", "FREE"); ZFS_TYPE_POOL, "<size>", "FREE");
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "FREEING"); ZFS_TYPE_POOL, "<size>", "FREEING");
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "LEAKED");
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC"); PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,


@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
return (dmu_object_free(os, obj, tx)); return (dmu_object_free(os, obj, tx));
} }
boolean_t
bptree_is_empty(objset_t *os, uint64_t obj)
{
dmu_buf_t *db;
bptree_phys_t *bt;
boolean_t rv;
VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;
rv = (bt->bt_begin == bt->bt_end);
dmu_buf_rele(db, FTAG);
return (rv);
}
void void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{ {
dmu_buf_t *db; dmu_buf_t *db;
bptree_phys_t *bt; bptree_phys_t *bt;
bptree_entry_phys_t bte; bptree_entry_phys_t *bte;
/* /*
* bptree objects are in the pool mos, therefore they can only be * bptree objects are in the pool mos, therefore they can only be
@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data; bt = db->db_data;
bte.be_birth_txg = birth_txg; bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
bte.be_bp = *bp; bte->be_birth_txg = birth_txg;
bzero(&bte.be_zb, sizeof (bte.be_zb)); bte->be_bp = *bp;
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
kmem_free(bte, sizeof (*bte));
dmu_buf_will_dirty(db, tx); dmu_buf_will_dirty(db, tx);
bt->bt_end++; bt->bt_end++;
@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (err); return (err);
} }
/*
* If "free" is set:
* - It is assumed that "func" will be freeing the block pointers.
* - If "func" returns nonzero, the bookmark will be remembered and
* iteration will be restarted from this point on next invocation.
* - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
* bptree_iterate will remember the bookmark, continue traversing
* any additional entries, and return 0.
*
* If "free" is not set, traversal will stop and return an error if
* an i/o error is encountered.
*
* In either case, if zfs_free_leak_on_eio is set, i/o errors will be
* ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
* traverse_dataset_destroyed()).
*/
int int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
void *arg, dmu_tx_t *tx) void *arg, dmu_tx_t *tx)
{ {
boolean_t ioerr = B_FALSE;
int err; int err;
uint64_t i; uint64_t i;
dmu_buf_t *db; dmu_buf_t *db;
@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
bptree_entry_phys_t bte; bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
ASSERT(!free || i == ba.ba_phys->bt_begin);
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH); &bte, DMU_READ_NO_PREFETCH);
if (err != 0) if (err != 0)
break; break;
if (zfs_recover) if (zfs_free_leak_on_eio)
flags |= TRAVERSE_HARD; flags |= TRAVERSE_HARD;
zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
"bookmark %lld/%lld/%lld/%lld",
i, (longlong_t)bte.be_birth_txg,
(longlong_t)bte.be_zb.zb_objset,
(longlong_t)bte.be_zb.zb_object,
(longlong_t)bte.be_zb.zb_level,
(longlong_t)bte.be_zb.zb_blkid);
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
bte.be_birth_txg, &bte.be_zb, flags, bte.be_birth_txg, &bte.be_zb, flags,
bptree_visit_cb, &ba); bptree_visit_cb, &ba);
if (free) { if (free) {
if (err == ERESTART) { /*
* The callback has freed the visited block pointers.
* Record our traversal progress on disk, either by
* updating this record's bookmark, or by logically
* removing this record by advancing bt_begin.
*/
if (err != 0) {
/* save bookmark for future resume */ /* save bookmark for future resume */
ASSERT3U(bte.be_zb.zb_objset, ==, ASSERT3U(bte.be_zb.zb_objset, ==,
ZB_DESTROYED_OBJSET); ZB_DESTROYED_OBJSET);
ASSERT0(bte.be_zb.zb_level); ASSERT0(bte.be_zb.zb_level);
dmu_write(os, obj, i * sizeof (bte), dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx); sizeof (bte), &bte, tx);
break; if (err == EIO || err == ECKSUM ||
} err == ENXIO) {
if (err != 0) { /*
* Skip the rest of this tree and
* continue on to the next entry.
*/
err = 0;
ioerr = B_TRUE;
} else {
break;
}
} else if (ioerr) {
/* /*
* We can not properly handle an i/o * This entry is finished, but there were
* error, because the traversal code * i/o errors on previous entries, so we
* does not know how to resume from an * can't adjust bt_begin. Set this entry's
* arbitrary bookmark. * be_birth_txg such that it will be
* treated as a no-op in future traversals.
*/ */
zfs_panic_recover("error %u from " bte.be_birth_txg = UINT64_MAX;
"traverse_dataset_destroyed()", err); dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
} }
ba.ba_phys->bt_begin++; if (!ioerr) {
(void) dmu_free_range(os, obj, ba.ba_phys->bt_begin++;
i * sizeof (bte), sizeof (bte), tx); (void) dmu_free_range(os, obj,
i * sizeof (bte), sizeof (bte), tx);
}
} else if (err != 0) {
break;
} }
} }
ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); ASSERT(!free || err != 0 || ioerr ||
ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
/* if all blocks are free there should be no used space */ /* if all blocks are free there should be no used space */
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
if (zfs_free_leak_on_eio) {
ba.ba_phys->bt_bytes = 0;
ba.ba_phys->bt_comp = 0;
ba.ba_phys->bt_uncomp = 0;
}
ASSERT0(ba.ba_phys->bt_bytes); ASSERT0(ba.ba_phys->bt_bytes);
ASSERT0(ba.ba_phys->bt_comp); ASSERT0(ba.ba_phys->bt_comp);
ASSERT0(ba.ba_phys->bt_uncomp); ASSERT0(ba.ba_phys->bt_uncomp);


@ -58,12 +58,11 @@ typedef struct traverse_data {
zbookmark_t *td_resume; zbookmark_t *td_resume;
int td_flags; int td_flags;
prefetch_data_t *td_pfd; prefetch_data_t *td_pfd;
boolean_t td_paused;
blkptr_cb_t *td_func; blkptr_cb_t *td_func;
void *td_arg; void *td_arg;
} traverse_data_t; } traverse_data_t;
#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD)
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object); uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
@ -165,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we found the block we're trying to resume from, zero * If we found the block we're trying to resume from, zero
* the bookmark out to indicate that we have resumed. * the bookmark out to indicate that we have resumed.
*/ */
ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
bzero(td->td_resume, sizeof (*zb)); bzero(td->td_resume, sizeof (*zb));
if (td->td_flags & TRAVERSE_POST) if (td->td_flags & TRAVERSE_POST)
@ -175,14 +173,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
return (RESUME_SKIP_NONE); return (RESUME_SKIP_NONE);
} }
static void
traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
{
ASSERT(td->td_resume != NULL);
ASSERT0(zb->zb_level);
bcopy(zb, td->td_resume, sizeof (*td->td_resume));
}
static void static void
traverse_prefetch_metadata(traverse_data_t *td, traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_t *zb) const blkptr_t *bp, const zbookmark_t *zb)
@ -211,9 +201,8 @@ static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
const blkptr_t *bp, const zbookmark_t *zb) const blkptr_t *bp, const zbookmark_t *zb)
{ {
int err = 0, lasterr = 0; int err = 0;
arc_buf_t *buf = NULL; arc_buf_t *buf = NULL;
boolean_t pause = B_FALSE;
switch (resume_skip_check(td, dnp, zb)) { switch (resume_skip_check(td, dnp, zb)) {
case RESUME_SKIP_ALL: case RESUME_SKIP_ALL:
@ -252,7 +241,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
if (BP_IS_HOLE(bp)) { if (BP_IS_HOLE(bp)) {
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
return (err); if (err != 0)
goto post;
return (0);
} }
if (td->td_pfd && !td->td_pfd->pd_exited && if (td->td_pfd && !td->td_pfd->pd_exited &&
@ -273,8 +264,6 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
td->td_arg); td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN) if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0); return (0);
if (err == ERESTART)
pause = B_TRUE; /* handle pausing at a common point */
if (err != 0) if (err != 0)
goto post; goto post;
} }
@ -288,7 +277,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0) if (err != 0)
return (err); goto post;
czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE); czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE);
@ -307,11 +296,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
zb->zb_blkid * epb + i); zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, err = traverse_visitbp(td, dnp,
&((blkptr_t *)buf->b_data)[i], czb); &((blkptr_t *)buf->b_data)[i], czb);
if (err != 0) { if (err != 0)
if (!TD_HARD(td)) break;
break;
lasterr = err;
}
} }
kmem_free(czb, sizeof (zbookmark_t)); kmem_free(czb, sizeof (zbookmark_t));
@ -324,7 +310,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0) if (err != 0)
return (err); goto post;
dnp = buf->b_data; dnp = buf->b_data;
for (i = 0; i < epb; i++) { for (i = 0; i < epb; i++) {
@ -336,11 +322,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
for (i = 0; i < epb; i++) { for (i = 0; i < epb; i++) {
err = traverse_dnode(td, &dnp[i], zb->zb_objset, err = traverse_dnode(td, &dnp[i], zb->zb_objset,
zb->zb_blkid * epb + i); zb->zb_blkid * epb + i);
if (err != 0) { if (err != 0)
if (!TD_HARD(td)) break;
break;
lasterr = err;
}
} }
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT; uint32_t flags = ARC_WAIT;
@ -350,7 +333,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0) if (err != 0)
return (err); goto post;
osp = buf->b_data; osp = buf->b_data;
dnp = &osp->os_meta_dnode; dnp = &osp->os_meta_dnode;
@ -365,19 +348,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = traverse_dnode(td, dnp, zb->zb_objset, err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_META_DNODE_OBJECT); DMU_META_DNODE_OBJECT);
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_groupused_dnode; dnp = &osp->os_groupused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset, err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_GROUPUSED_OBJECT); DMU_GROUPUSED_OBJECT);
} }
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_userused_dnode; dnp = &osp->os_userused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset, err = traverse_dnode(td, dnp, zb->zb_objset,
@ -389,19 +364,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
(void) arc_buf_remove_ref(buf, &buf); (void) arc_buf_remove_ref(buf, &buf);
post: post:
if (err == 0 && (td->td_flags & TRAVERSE_POST)) { if (err == 0 && (td->td_flags & TRAVERSE_POST))
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
if (err == ERESTART)
pause = B_TRUE; if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
/*
* Ignore this disk error as requested by the HARD flag,
* and continue traversal.
*/
err = 0;
} }
if (pause && td->td_resume != NULL) { /*
ASSERT3U(err, ==, ERESTART); * If we are stopping here, set td_resume.
ASSERT(!TD_HARD(td)); */
traverse_pause(td, zb); if (td->td_resume != NULL && err != 0 && !td->td_paused) {
td->td_resume->zb_objset = zb->zb_objset;
td->td_resume->zb_object = zb->zb_object;
td->td_resume->zb_level = 0;
/*
* If we have stopped on an indirect block (e.g. due to
* i/o error), we have not visited anything below it.
* Set the bookmark to the first level-0 block that we need
* to visit. This way, the resuming code does not need to
* deal with resuming from indirect blocks.
*/
td->td_resume->zb_blkid = zb->zb_blkid <<
(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
td->td_paused = B_TRUE;
} }
return (err != 0 ? err : lasterr); return (err);
} }
static void static void
@ -426,29 +419,21 @@ static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object) uint64_t objset, uint64_t object)
{ {
int j, err = 0, lasterr = 0; int j, err = 0;
zbookmark_t czb; zbookmark_t czb;
for (j = 0; j < dnp->dn_nblkptr; j++) { for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
if (err != 0) { if (err != 0)
if (!TD_HARD(td)) break;
break;
lasterr = err;
}
} }
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
if (err != 0) {
if (!TD_HARD(td))
return (err);
lasterr = err;
}
} }
return (err != 0 ? err : lasterr); return (err);
} }
/* ARGSUSED */ /* ARGSUSED */
@ -539,6 +524,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td->td_arg = arg; td->td_arg = arg;
td->td_pfd = pd; td->td_pfd = pd;
td->td_flags = flags; td->td_flags = flags;
td->td_paused = B_FALSE;
pd->pd_blks_max = zfs_pd_blks_max; pd->pd_blks_max = zfs_pd_blks_max;
pd->pd_flags = flags; pd->pd_flags = flags;
@ -617,7 +603,7 @@ int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags, traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg) blkptr_cb_t func, void *arg)
{ {
int err, lasterr = 0; int err;
uint64_t obj; uint64_t obj;
dsl_pool_t *dp = spa_get_dsl(spa); dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset; objset_t *mos = dp->dp_meta_objset;
@ -630,16 +616,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
return (err); return (err);
/* visit each dataset */ /* visit each dataset */
for (obj = 1; err == 0 || (err != ESRCH && hard); for (obj = 1; err == 0;
err = dmu_object_next(mos, &obj, FALSE, txg_start)) { err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
dmu_object_info_t doi; dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi); err = dmu_object_info(mos, obj, &doi);
if (err != 0) { if (err != 0) {
if (!hard) if (hard)
return (err); continue;
lasterr = err; break;
continue;
} }
if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
@ -650,25 +635,21 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
dsl_pool_config_exit(dp, FTAG); dsl_pool_config_exit(dp, FTAG);
if (err != 0) { if (err != 0) {
if (!hard) if (hard)
return (err); continue;
lasterr = err; break;
continue;
} }
if (ds->ds_phys->ds_prev_snap_txg > txg) if (ds->ds_phys->ds_prev_snap_txg > txg)
txg = ds->ds_phys->ds_prev_snap_txg; txg = ds->ds_phys->ds_prev_snap_txg;
err = traverse_dataset(ds, txg, flags, func, arg); err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(ds, FTAG);
if (err != 0) { if (err != 0)
if (!hard) break;
return (err);
lasterr = err;
}
} }
} }
if (err == ESRCH) if (err == ESRCH)
err = 0; err = 0;
return (err != 0 ? err : lasterr); return (err);
} }
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)


@ -245,6 +245,13 @@ dsl_pool_open(dsl_pool_t *dp)
dp->dp_meta_objset, obj)); dp->dp_meta_objset, obj));
} }
/*
* Note: errors ignored, because the leak dir will not exist if we
* have not encountered a leak yet.
*/
(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
&dp->dp_leak_dir);
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
@ -292,6 +299,8 @@ dsl_pool_close(dsl_pool_t *dp)
dsl_dir_rele(dp->dp_mos_dir, dp); dsl_dir_rele(dp->dp_mos_dir, dp);
if (dp->dp_free_dir) if (dp->dp_free_dir)
dsl_dir_rele(dp->dp_free_dir, dp); dsl_dir_rele(dp->dp_free_dir, dp);
if (dp->dp_leak_dir)
dsl_dir_rele(dp->dp_leak_dir, dp);
if (dp->dp_root_dir) if (dp->dp_root_dir)
dsl_dir_rele(dp->dp_root_dir, dp); dsl_dir_rele(dp->dp_root_dir, dp);


@ -65,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
int zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
@ -1417,7 +1417,7 @@ dsl_scan_active(dsl_scan_t *scn)
if (spa_shutting_down(spa)) if (spa_shutting_down(spa))
return (B_FALSE); return (B_FALSE);
if (scn->scn_phys.scn_state == DSS_SCANNING || if (scn->scn_phys.scn_state == DSS_SCANNING ||
scn->scn_async_destroying) (scn->scn_async_destroying && !scn->scn_async_stalled))
return (B_TRUE); return (B_TRUE);
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
@ -1432,7 +1432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{ {
dsl_scan_t *scn = dp->dp_scan; dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa; spa_t *spa = dp->dp_spa;
int err; int err = 0;
/* /*
* Check for scn_restart_txg before checking spa_load_state, so * Check for scn_restart_txg before checking spa_load_state, so
@ -1450,7 +1450,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_scan_setup_sync(&func, tx); dsl_scan_setup_sync(&func, tx);
} }
if (!dsl_scan_active(scn) || /*
* If the scan is inactive due to a stalled async destroy, try again.
*/
if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
spa_sync_pass(dp->dp_spa) > 1) spa_sync_pass(dp->dp_spa) > 1)
return; return;
@ -1460,10 +1463,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
spa->spa_scrub_active = B_TRUE; spa->spa_scrub_active = B_TRUE;
/* /*
* First process the free list. If we pause the free, don't do * First process the async destroys. If we pause, don't do
* any scanning. This ensures that there is no free list when * any scrubbing or resilvering. This ensures that there are no
* we are scanning, so the scan code doesn't have to worry about * async destroys while we are scanning, so the scan code doesn't
* traversing it. * have to worry about traversing it. It is also faster to free the
* blocks than to scrub them.
*/ */
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE; scn->scn_is_bptree = B_FALSE;
@ -1473,48 +1477,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_scan_free_block_cb, scn, tx); dsl_scan_free_block_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
if (err == 0 && spa_feature_is_active(spa, if (err != 0 && err != ERESTART)
SPA_FEATURE_ASYNC_DESTROY)) { zfs_panic_recover("error %u from bpobj_iterate()", err);
ASSERT(scn->scn_async_destroying); }
scn->scn_is_bptree = B_TRUE;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bptree_iterate(dp->dp_meta_objset,
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
scn, tx);
VERIFY0(zio_wait(scn->scn_zio_root));
if (err == 0) { if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
/* finished; deactivate async destroy feature */ ASSERT(scn->scn_async_destroying);
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, scn->scn_is_bptree = B_TRUE;
tx); scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
ASSERT(!spa_feature_is_active(spa, NULL, ZIO_FLAG_MUSTSUCCEED);
SPA_FEATURE_ASYNC_DESTROY)); err = bptree_iterate(dp->dp_meta_objset,
VERIFY0(zap_remove(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
DMU_POOL_DIRECTORY_OBJECT, VERIFY0(zio_wait(scn->scn_zio_root));
DMU_POOL_BPTREE_OBJ, tx));
VERIFY0(bptree_free(dp->dp_meta_objset, if (err == EIO || err == ECKSUM) {
dp->dp_bptree_obj, tx)); err = 0;
dp->dp_bptree_obj = 0; } else if (err != 0 && err != ERESTART) {
scn->scn_async_destroying = B_FALSE; zfs_panic_recover("error %u from "
} "traverse_dataset_destroyed()", err);
} }
if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from " /*
"free_bpobj/bptree txg %llu", * If we didn't make progress, mark the async destroy as
(longlong_t)scn->scn_visited_this_txg, * stalled, so that we will not initiate a spa_sync() on
(longlong_t) * its behalf.
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), */
(longlong_t)tx->tx_txg); scn->scn_async_stalled = (scn->scn_visited_this_txg == 0);
scn->scn_visited_this_txg = 0;
/* if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
* Re-sync the ddt so that we can further modify /* finished; deactivate async destroy feature */
* it when doing bprewrite. spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
*/ ASSERT(!spa_feature_is_active(spa,
ddt_sync(spa, tx->tx_txg); SPA_FEATURE_ASYNC_DESTROY));
VERIFY0(zap_remove(dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, tx));
VERIFY0(bptree_free(dp->dp_meta_objset,
dp->dp_bptree_obj, tx));
dp->dp_bptree_obj = 0;
scn->scn_async_destroying = B_FALSE;
} }
if (err == ERESTART) }
return; if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from "
"free_bpobj/bptree txg %llu; err=%u",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
(longlong_t)tx->tx_txg, err);
scn->scn_visited_this_txg = 0;
/*
* Write out changes to the DDT that may be required as a
* result of the blocks freed. This ensures that the DDT
* is clean when a scrub/resilver runs.
*/
ddt_sync(spa, tx->tx_txg);
}
if (err != 0)
return;
if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
(dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
/*
* We have finished background destroying, but there is still
* some space left in the dp_free_dir. Transfer this leaked
* space to the dp_leak_dir.
*/
if (dp->dp_leak_dir == NULL) {
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
LEAK_DIR_NAME, tx);
VERIFY0(dsl_pool_open_special_dir(dp,
LEAK_DIR_NAME, &dp->dp_leak_dir));
rrw_exit(&dp->dp_config_rwlock, FTAG);
}
dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
dp->dp_free_dir->dd_phys->dd_used_bytes,
dp->dp_free_dir->dd_phys->dd_compressed_bytes,
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
-dp->dp_free_dir->dd_phys->dd_used_bytes,
-dp->dp_free_dir->dd_phys->dd_compressed_bytes,
-dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
}
if (!scn->scn_async_destroying) {
/* finished; verify that space accounting went to zero */ /* finished; verify that space accounting went to zero */
ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes); ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes); ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);


@ -238,19 +238,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
} }
if (pool != NULL) { if (pool != NULL) {
dsl_dir_t *freedir = pool->dp_free_dir;
/* /*
* The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
* when opening pools before this version freedir will be NULL. * when opening pools before this version freedir will be NULL.
*/ */
if (freedir != NULL) { if (pool->dp_free_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
freedir->dd_phys->dd_used_bytes, src); pool->dp_free_dir->dd_phys->dd_used_bytes, src);
} else { } else {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
NULL, 0, src); NULL, 0, src);
} }
if (pool->dp_leak_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
NULL, 0, src);
}
} }
spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);


@ -29,7 +29,7 @@
list_t zfs_dbgmsgs; list_t zfs_dbgmsgs;
int zfs_dbgmsg_size; int zfs_dbgmsg_size;
kmutex_t zfs_dbgmsgs_lock; kmutex_t zfs_dbgmsgs_lock;
int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
#endif #endif
/* /*
@ -44,7 +44,38 @@ int zfs_flags = 0;
* This should only be used as a last resort, as it typically results * This should only be used as a last resort, as it typically results
* in leaked space, or worse. * in leaked space, or worse.
*/ */
int zfs_recover = 0; int zfs_recover = B_FALSE;
/*
* If destroy encounters an EIO while reading metadata (e.g. indirect
* blocks), space referenced by the missing metadata can not be freed.
* Normally this causes the background destroy to become "stalled", as
* it is unable to make forward progress. While in this stalled state,
* all remaining space to free from the error-encountering filesystem is
* "temporarily leaked". Set this flag to cause it to ignore the EIO,
* permanently leak the space from indirect blocks that can not be read,
* and continue to free everything else that it can.
*
* The default, "stalling" behavior is useful if the storage partially
* fails (i.e. some but not all i/os fail), and then later recovers. In
* this case, we will be able to continue pool operations while it is
* partially failed, and when it recovers, we can continue to free the
* space, with no leaks. However, note that this case is actually
* fairly rare.
*
* Typically pools either (a) fail completely (but perhaps temporarily,
* e.g. a top-level vdev going offline), or (b) have localized,
* permanent errors (e.g. disk returns the wrong data due to bit flip or
* firmware bug). In case (a), this setting does not matter because the
* pool will be suspended and the sync thread will not be able to make
* forward progress regardless. In case (b), because the error is
* permanent, the best we can do is leak the minimum amount of space,
* which is what setting this flag will do. Therefore, it is reasonable
* for this flag to normally be set, but we chose the more conservative
* approach of not setting it, so that there is no possibility of
* leaking space in the "partial temporary" failure case.
*/
int zfs_free_leak_on_eio = B_FALSE;
void void
@ -163,4 +194,8 @@ MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags");
module_param(zfs_recover, int, 0644); module_param(zfs_recover, int, 0644);
MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors"); MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors");
module_param(zfs_free_leak_on_eio, int, 0644);
MODULE_PARM_DESC(zfs_free_leak_on_eio,
"Set to ignore IO errors during free and permanently leak the space");
#endif /* _KERNEL */ #endif /* _KERNEL */


@ -3356,13 +3356,6 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
ASSERT(zb1->zb_objset == zb2->zb_objset); ASSERT(zb1->zb_objset == zb2->zb_objset);
ASSERT(zb2->zb_level == 0); ASSERT(zb2->zb_level == 0);
/*
* A bookmark in the deadlist is considered to be after
* everything else.
*/
if (zb2->zb_object == DMU_DEADLIST_OBJECT)
return (B_TRUE);
/* The objset_phys_t isn't before anything. */ /* The objset_phys_t isn't before anything. */
if (dnp == NULL) if (dnp == NULL)
return (B_FALSE); return (B_FALSE);