diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index b7a4adae9c..dc8193d8c8 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -36,6 +36,29 @@
 #include
 #include
 
+struct dbuf_hold_impl_data {
+	/* Function arguments */
+	dnode_t *dh_dn;
+	uint8_t dh_level;
+	uint64_t dh_blkid;
+	int dh_fail_sparse;
+	void *dh_tag;
+	dmu_buf_impl_t **dh_dbp;
+	/* Local variables */
+	dmu_buf_impl_t *dh_db;
+	dmu_buf_impl_t *dh_parent;
+	blkptr_t *dh_bp;
+	int dh_err;
+	dbuf_dirty_record_t *dh_dr;
+	arc_buf_contents_t dh_type;
+	int dh_depth;
+};
+
+static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+    dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp, int depth);
+static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
+
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@@ -1504,7 +1527,7 @@ dbuf_clear(dmu_buf_impl_t *db)
 __attribute__((always_inline))
 static inline int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
-    dmu_buf_impl_t **parentp, blkptr_t **bpp)
+    dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
 {
 	int nlevels, epbs;
 
@@ -1541,8 +1564,17 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 		return (ENOENT);
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
-		int err = dbuf_hold_impl(dn, level+1,
-		    blkid >> epbs, fail_sparse, NULL, parentp);
+		int err;
+		if (dh == NULL) {
+			err = dbuf_hold_impl(dn, level+1, blkid >> epbs,
+			    fail_sparse, NULL, parentp);
+		}
+		else {
+			__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
+			    blkid >> epbs, fail_sparse, NULL,
+			    parentp, dh->dh_depth + 1);
+			err = __dbuf_hold_impl(dh + 1);
+		}
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
@@ -1735,7 +1767,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 		db = NULL;
 	}
 
-	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
+	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
 			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
 			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
@@ -1762,98 +1794,142 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 	}
 }
 
+#define	DBUF_HOLD_IMPL_MAX_DEPTH	20
+
 /*
  * Returns with db_holds incremented, and db_mtx not held.
  * Note: dn_struct_rwlock must be held.
  */
-int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
-    void *tag, dmu_buf_impl_t **dbp)
+static int
+__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
 {
-	dmu_buf_impl_t *db, *parent = NULL;
+	ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
+	dh->dh_parent = NULL;
 
-	ASSERT(blkid != DMU_BONUS_BLKID);
-	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
-	ASSERT3U(dn->dn_nlevels, >, level);
+	ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
+	ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
+	ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
 
-	*dbp = NULL;
+	*(dh->dh_dbp) = NULL;
 top:
 	/* dbuf_find() returns with db_mtx held */
-	db = dbuf_find(dn, level, blkid);
+	dh->dh_db = dbuf_find(dh->dh_dn, dh->dh_level, dh->dh_blkid);
 
-	if (db == NULL) {
-		blkptr_t *bp = NULL;
-		int err;
+	if (dh->dh_db == NULL) {
+		dh->dh_bp = NULL;
 
-		ASSERT3P(parent, ==, NULL);
-		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
-		if (fail_sparse) {
-			if (err == 0 && bp && BP_IS_HOLE(bp))
-				err = ENOENT;
-			if (err) {
-				if (parent)
-					dbuf_rele(parent, NULL);
-				return (err);
+		ASSERT3P(dh->dh_parent, ==, NULL);
+		dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+		    dh->dh_fail_sparse, &dh->dh_parent,
+		    &dh->dh_bp, dh);
+		if (dh->dh_fail_sparse) {
+			if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
+				dh->dh_err = ENOENT;
+			if (dh->dh_err) {
+				if (dh->dh_parent)
+					dbuf_rele(dh->dh_parent, NULL);
+				return (dh->dh_err);
 			}
 		}
-		if (err && err != ENOENT)
-			return (err);
-		db = dbuf_create(dn, level, blkid, parent, bp);
+		if (dh->dh_err && dh->dh_err != ENOENT)
+			return (dh->dh_err);
+		dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+		    dh->dh_parent, dh->dh_bp);
 	}
 
-	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
-		arc_buf_add_ref(db->db_buf, db);
-		if (db->db_buf->b_data == NULL) {
-			dbuf_clear(db);
-			if (parent) {
-				dbuf_rele(parent, NULL);
-				parent = NULL;
+	if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
+		arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
+		if (dh->dh_db->db_buf->b_data == NULL) {
+			dbuf_clear(dh->dh_db);
+			if (dh->dh_parent) {
+				dbuf_rele(dh->dh_parent, NULL);
+				dh->dh_parent = NULL;
 			}
 			goto top;
 		}
-		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+		ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
 	}
 
-	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+	ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
 
 	/*
 	 * If this buffer is currently syncing out, and we are are
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
-	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-	    dn->dn_object != DMU_META_DNODE_OBJECT &&
-	    db->db_state == DB_CACHED && db->db_data_pending) {
-		dbuf_dirty_record_t *dr = db->db_data_pending;
+	if (dh->dh_db->db_level == 0 &&
+	    dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
+	    dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
+	    dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
+		dh->dh_dr = dh->dh_db->db_data_pending;
 
-		if (dr->dt.dl.dr_data == db->db_buf) {
-			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) {
+			dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db);
 
-			dbuf_set_data(db,
-			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-			    db->db.db_size, db, type));
-			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
-			    db->db.db_size);
+			dbuf_set_data(dh->dh_db,
+			    arc_buf_alloc(dh->dh_db->db_dnode->dn_objset->os_spa,
+			    dh->dh_db->db.db_size, dh->dh_db, dh->dh_type));
+			bcopy(dh->dh_dr->dt.dl.dr_data->b_data, dh->dh_db->db.db_data,
+			    dh->dh_db->db.db_size);
 		}
 	}
 
-	(void) refcount_add(&db->db_holds, tag);
-	dbuf_update_data(db);
-	DBUF_VERIFY(db);
-	mutex_exit(&db->db_mtx);
+	(void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
+	dbuf_update_data(dh->dh_db);
+	DBUF_VERIFY(dh->dh_db);
+	mutex_exit(&dh->dh_db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
-	if (parent)
-		dbuf_rele(parent, NULL);
+	if (dh->dh_parent)
+		dbuf_rele(dh->dh_parent, NULL);
 
-	ASSERT3P(db->db_dnode, ==, dn);
-	ASSERT3U(db->db_blkid, ==, blkid);
-	ASSERT3U(db->db_level, ==, level);
-	*dbp = db;
+	ASSERT3P(dh->dh_db->db_dnode, ==, dh->dh_dn);
+	ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
+	ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
+	*(dh->dh_dbp) = dh->dh_db;
 
 	return (0);
 }
 
+/*
+ * The following code preserves the recursive function dbuf_hold_impl()
+ * but moves the local variables AND function arguments to the heap to
+ * minimize the stack frame size.  Enough space is initially allocated
+ * on the heap for 20 levels of recursion.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp)
+{
+	struct dbuf_hold_impl_data *dh;
+	int error;
+
+	dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) *
+	    DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
+	__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
+
+	error = __dbuf_hold_impl(dh);
+
+	kmem_free(dh, sizeof(struct dbuf_hold_impl_data) *
+	    DBUF_HOLD_IMPL_MAX_DEPTH);
+
+	return (error);
+}
+
+static void
+__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+    dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp, int depth)
+{
+	dh->dh_dn = dn;
+	dh->dh_level = level;
+	dh->dh_blkid = blkid;
+	dh->dh_fail_sparse = fail_sparse;
+	dh->dh_tag = tag;
+	dh->dh_dbp = dbp;
+	dh->dh_depth = depth;
+}
+
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
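Taken together, the dbuf.c hunks keep dbuf_hold_impl() recursive but park every argument and local in a preallocated array of per-level dbuf_hold_impl_data entries, so each real stack frame holds little more than the dh pointer and a return address; the ASSERT3S guard bounds the recursion at DBUF_HOLD_IMPL_MAX_DEPTH entries. Below is a minimal userspace sketch of the same pattern, not code from the patch: calloc()/free() stand in for kmem_zalloc()/kmem_free() with KM_SLEEP, and sum(), __sum_impl(), sum_impl_data, and MAX_DEPTH are hypothetical names.

#include <assert.h>
#include <stdlib.h>

#define	MAX_DEPTH	20

/*
 * One entry per recursion level: the function's arguments and locals
 * live in this heap array rather than in each C stack frame.
 */
struct sum_impl_data {
	const int *sd_array;	/* function arguments */
	int sd_len;
	long sd_partial;	/* local variable */
	int sd_depth;
};

static long
__sum_impl(struct sum_impl_data *sd)
{
	assert(sd->sd_depth < MAX_DEPTH);

	if (sd->sd_len == 0)
		return (0);

	/* Recurse using the *next* preallocated entry, not fresh locals. */
	sd[1].sd_array = sd->sd_array + 1;
	sd[1].sd_len = sd->sd_len - 1;
	sd[1].sd_partial = 0;
	sd[1].sd_depth = sd->sd_depth + 1;

	sd->sd_partial = sd->sd_array[0] + __sum_impl(sd + 1);
	return (sd->sd_partial);
}

long
sum(const int *array, int len)
{
	/* kmem_zalloc(..., KM_SLEEP) in the kernel; calloc() here. */
	struct sum_impl_data *sd = calloc(MAX_DEPTH, sizeof (*sd));
	long total;

	if (sd == NULL)		/* KM_SLEEP cannot fail in-kernel */
		return (-1);

	sd[0].sd_array = array;
	sd[0].sd_len = len;
	sd[0].sd_partial = 0;
	sd[0].sd_depth = 0;

	total = __sum_impl(sd);
	free(sd);
	return (total);
}

As in the patch, the non-recursive wrapper owns the one allocation, so no recursion level needs its own cleanup path.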
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 0eb1b1f782..c72c74fad6 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -789,18 +789,21 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	arc_buf_t *buf = NULL;
-	blkptr_t bp_toread = *bp;
+	blkptr_t *bp_toread;
+
+	bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+	*bp_toread = *bp;
 
 	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
 
 	if (dsl_scan_check_pause(scn, zb))
-		return;
+		goto out;
 
 	if (dsl_scan_check_resume(scn, dnp, zb))
-		return;
+		goto out;
 
 	if (bp->blk_birth == 0)
-		return;
+		goto out;
 
 	scn->scn_visited_this_txg++;
 
@@ -811,7 +814,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
 	    pbuf, bp);
 
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
-		return;
+		goto out;
 
 	if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
 		/*
@@ -826,12 +829,12 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
 		 * it (original untranslated -> translations from
 		 * deleted snap -> now).
 		 */
-		bp_toread = *bp;
+		*bp_toread = *bp;
 	}
 
-	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
+	if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx,
 	    &buf) != 0)
-		return;
+		goto out;
 
 	/*
 	 * If dsl_scan_ddt() has aready visited this block, it will have
@@ -841,7 +844,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
 	if (ddt_class_contains(dp->dp_spa,
 	    scn->scn_phys.scn_ddt_class_max, bp)) {
 		ASSERT(buf == NULL);
-		return;
+		goto out;
 	}
 
 	/*
@@ -856,6 +859,8 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
 	}
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);
+out:
+	kmem_free(bp_toread, sizeof(blkptr_t));
 }
 
 static void
diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c
index a4b34423a2..c5439bacc4 100644
--- a/module/zfs/vdev_cache.c
+++ b/module/zfs/vdev_cache.c
@@ -244,7 +244,7 @@ int
 vdev_cache_read(zio_t *zio)
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
-	vdev_cache_entry_t *ve, ve_search;
+	vdev_cache_entry_t *ve, *ve_search;
 	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
 	ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);)
 	zio_t *fio;
@@ -267,8 +267,10 @@ vdev_cache_read(zio_t *zio)
 
 	mutex_enter(&vc->vc_lock);
 
-	ve_search.ve_offset = cache_offset;
-	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+	ve_search = kmem_alloc(sizeof(vdev_cache_entry_t), KM_SLEEP);
+	ve_search->ve_offset = cache_offset;
+	ve = avl_find(&vc->vc_offset_tree, ve_search, NULL);
+	kmem_free(ve_search, sizeof(vdev_cache_entry_t));
 
 	if (ve != NULL) {
 		if (ve->ve_missed_update) {
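The dsl_scan.c and vdev_cache.c hunks make the same trade in smaller doses: a blkptr_t copy and an AVL search key move from the stack to a short-lived kmem_alloc() buffer. Because the allocation must now be released on every path out of dsl_scan_visitbp(), each early return becomes goto out with a single kmem_free() at the label. A compressed sketch of that idiom, with hypothetical names (struct blk, visit(), should_skip(), process(), none of them from the patch) and userspace malloc()/free() in place of the kernel allocators:

#include <stdlib.h>

struct blk { unsigned long birth; };		/* stand-in for blkptr_t */

static int should_skip(const struct blk *b) { return (b->birth == 0); }
static void process(const struct blk *b) { (void) b; }

static void
visit(const struct blk *bp)
{
	/* Heap copy instead of "struct blk copy = *bp;" on the stack. */
	struct blk *copy = malloc(sizeof (*copy));

	if (copy == NULL)		/* KM_SLEEP cannot fail in-kernel */
		return;
	*copy = *bp;

	if (should_skip(copy))
		goto out;		/* was a bare "return;" */

	process(copy);
out:
	free(copy);
}

The vdev_cache.c case needs no label because the key can be freed immediately: avl_find() only reads the search key during the lookup, so nothing but the returned entry pointer survives the call.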
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 201e9ccce0..46588922dc 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -2644,11 +2644,6 @@ zio_ready(zio_t *zio)
 static int
 zio_done(zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	zio_t *lio = zio->io_logical;
-	blkptr_t *bp = zio->io_bp;
-	vdev_t *vd = zio->io_vd;
-	uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 	int c, w;
 
@@ -2666,18 +2661,18 @@ zio_done(zio_t *zio)
 	for (w = 0; w < ZIO_WAIT_TYPES; w++)
 		ASSERT(zio->io_children[c][w] == 0);
 
-	if (bp != NULL) {
-		ASSERT(bp->blk_pad[0] == 0);
-		ASSERT(bp->blk_pad[1] == 0);
-		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
-		    (bp == zio_unique_parent(zio)->io_bp));
-		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+	if (zio->io_bp != NULL) {
+		ASSERT(zio->io_bp->blk_pad[0] == 0);
+		ASSERT(zio->io_bp->blk_pad[1] == 0);
+		ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
+		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
+		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
-			ASSERT(!BP_SHOULD_BYTESWAP(bp));
-			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
-			ASSERT(BP_COUNT_GANG(bp) == 0 ||
-			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+			ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
+			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
+			    (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp)));
 		}
 	}
 
@@ -2696,13 +2691,13 @@ zio_done(zio_t *zio)
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
-			uint64_t asize = P2ROUNDUP(psize, align);
+			uint64_t asize = P2ROUNDUP(zio->io_size, align);
 			char *abuf = zio->io_data;
 
-			if (asize != psize) {
+			if (asize != zio->io_size) {
 				abuf = zio_buf_alloc(asize);
-				bcopy(zio->io_data, abuf, psize);
-				bzero(abuf + psize, asize - psize);
+				bcopy(zio->io_data, abuf, zio->io_size);
+				bzero(abuf + zio->io_size, asize - zio->io_size);
 			}
 
 			zio->io_cksum_report = zcr->zcr_next;
@@ -2710,14 +2705,14 @@ zio_done(zio_t *zio)
 			zcr->zcr_finish(zcr, abuf);
 			zfs_ereport_free_checksum(zcr);
 
-			if (asize != psize)
+			if (asize != zio->io_size)
 				zio_buf_free(abuf, asize);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
-	vdev_stat_update(zio, psize);
+	vdev_stat_update(zio, zio->io_size);
 
 	if (zio->io_error) {
 		/*
@@ -2726,28 +2721,30 @@ zio_done(zio_t *zio)
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
-		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
-			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
+		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
+		    !vdev_is_dead(zio->io_vd))
+			zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
+			    zio->io_vd, zio, 0, 0);
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
-		    zio == lio) {
+		    zio == zio->io_logical) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
 			 */
-			spa_log_error(spa, zio);
-			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
+			spa_log_error(zio->io_spa, zio);
+			zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, zio,
 			    0, 0);
 		}
 	}
 
-	if (zio->io_error && zio == lio) {
+	if (zio->io_error && zio == zio->io_logical) {
 		/*
 		 * Determine whether zio should be reexecuted.  This will
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
-		ASSERT(vd == NULL && bp != NULL);
+		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		if (IO_IS_ALLOCATING(zio) &&
@@ -2761,8 +2758,8 @@ zio_done(zio_t *zio)
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    zio->io_error == ENXIO &&
-		    spa_load_state(spa) == SPA_LOAD_NONE &&
-		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
+		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
+		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
@@ -2788,7 +2785,7 @@ zio_done(zio_t *zio)
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
-		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
+		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
@@ -2853,14 +2850,14 @@ zio_done(zio_t *zio)
 		 * We'd fail again if we reexecuted now, so suspend
 		 * until conditions improve (e.g. device comes online).
 		 */
-		zio_suspend(spa, zio);
+		zio_suspend(zio->io_spa, zio);
 	} else {
 		/*
 		 * Reexecution is potentially a huge amount of work.
 		 * Hand it off to the otherwise-unused claim taskq.
 		 */
 		(void) taskq_dispatch(
-		    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
+		    zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
 		    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
 	}
 	return (ZIO_PIPELINE_STOP);
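The zio.c hunks shrink zio_done()'s frame the other way around: instead of heap allocation, they drop the cached copies (spa, lio, bp, vd, psize) and reload the members through zio at each use. In a function this large the five locals would otherwise be live across many calls and end up spilled to the stack. A trivial before-and-after sketch; struct io, account(), and both functions are hypothetical and only illustrate the transformation:

struct spa;
struct io {
	struct spa *io_spa;
	unsigned long io_size;
};

static void
account(struct spa *spa, unsigned long size)
{
	(void) spa;	/* placeholder consumer */
	(void) size;
}

/* Before: cached copies can cost stack slots once registers spill. */
void
done_cached(struct io *zio)
{
	struct spa *spa = zio->io_spa;
	unsigned long psize = zio->io_size;

	account(spa, psize);
}

/* After: no extra locals; the member loads are re-issued as needed. */
void
done_direct(struct io *zio)
{
	account(zio->io_spa, zio->io_size);
}

On a function as small as this sketch the compiler emits identical code for both; the saving in zio_done() comes from its length and the many call sites that keep the cached copies live.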