OpenZFS 7793 - ztest fails assertion in dmu_tx_willuse_space

Reviewed by: Steve Gonczi <steve.gonczi@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

Background information: This assertion about tx_space_* verifies that we
are not dirtying more stuff than we thought we would. We “need” to know
how much we will dirty so that we can check if we should fail this
transaction with ENOSPC/EDQUOT, in dmu_tx_assign(). While the
transaction is open (i.e. between dmu_tx_assign() and dmu_tx_commit() —
typically less than a millisecond), we call dbuf_dirty() on the exact
blocks that will be modified. Once this happens, the temporary
accounting in tx_space_* is unnecessary, because we know exactly what
blocks are newly dirtied; we call dnode_willuse_space() to track this
more exact accounting.

The fundamental problem causing this bug is that dmu_tx_hold_*() relies
on the current state in the DMU (e.g. dn_nlevels) to predict how much
will be dirtied by this transaction, but this state can change before we
actually perform the transaction (i.e. call dbuf_dirty()).

This bug will be fixed by removing the assertion that the tx_space_*
accounting is perfectly accurate (i.e. we never dirty more than was
predicted by dmu_tx_hold_*()). By removing the requirement that this
accounting be perfectly accurate, we can also vastly simplify it, e.g.
removing most of the logic in dmu_tx_count_*().

The new tx space accounting will be very approximate, and may be more or
less than what is actually dirtied. It will still be used to determine
if this transaction will put us over quota. Transactions that are marked
by dmu_tx_mark_netfree() will be excepted from this check. We won’t make
an attempt to determine how much space will be freed by the transaction
— this was rarely accurate enough to determine if a transaction should
be permitted when we are over quota, which is why dmu_tx_mark_netfree()
was introduced in 2014.

We also won’t attempt to give “credit” when overwriting existing blocks,
if those blocks may be freed. This allows us to remove the
do_free_accounting logic in dbuf_dirty(), and associated routines. This
logic attempted to predict what will be on disk when this txg syncs, to
know if the overwritten block will be freed (i.e. exists, and has no
snapshots).

OpenZFS-issue: https://www.illumos.org/issues/7793
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3704e0a
Upstream bugs: DLPX-32883a
Closes #5804 

Porting notes:
- DNODE_SIZE replaced with DNODE_MIN_SIZE in dmu_tx_count_dnode().
  Using the default dnode size would be slightly better.
- DEBUG_DMU_TX wrappers and configure option removed.
- Resolved _by_dnode() conflicts; these changes have not yet been
  applied to OpenZFS.
This commit is contained in:
Brian Behlendorf 2017-03-07 09:51:59 -08:00 committed by GitHub
parent e2fcb56275
commit 3ec3bc2167
22 changed files with 245 additions and 1072 deletions

View File

@ -37,29 +37,6 @@ AC_DEFUN([ZFS_AC_DEBUG], [
AC_MSG_RESULT([$enable_debug]) AC_MSG_RESULT([$enable_debug])
]) ])
AC_DEFUN([ZFS_AC_DEBUG_DMU_TX], [
AC_ARG_ENABLE([debug-dmu-tx],
[AS_HELP_STRING([--enable-debug-dmu-tx],
[Enable dmu tx validation @<:@default=no@:>@])],
[],
[enable_debug_dmu_tx=no])
AS_IF([test "x$enable_debug_dmu_tx" = xyes],
[
KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_DMU_TX"
DEBUG_DMU_TX="_with_debug_dmu_tx"
AC_DEFINE([DEBUG_DMU_TX], [1],
[Define to 1 to enabled dmu tx validation])
],
[
DEBUG_DMU_TX="_without_debug_dmu_tx"
])
AC_SUBST(DEBUG_DMU_TX)
AC_MSG_CHECKING([whether dmu tx validation is enabled])
AC_MSG_RESULT([$enable_debug_dmu_tx])
])
AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [
ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE
ZFS_AC_CONFIG_ALWAYS_NO_BOOL_COMPARE ZFS_AC_CONFIG_ALWAYS_NO_BOOL_COMPARE
@ -140,7 +117,7 @@ AC_DEFUN([ZFS_AC_RPM], [
AC_MSG_RESULT([$HAVE_RPMBUILD]) AC_MSG_RESULT([$HAVE_RPMBUILD])
]) ])
RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1" --define "$(DEBUG_DMU_TX) 1"' RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"'
RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)" --define "_udevdir $(udevdir)" --define "_udevruledir $(udevruledir)" --define "_initconfdir $(DEFAULT_INITCONF_DIR)" $(DEFINE_INITRAMFS)' RPM_DEFINE_UTIL='--define "_dracutdir $(dracutdir)" --define "_udevdir $(udevdir)" --define "_udevruledir $(udevruledir)" --define "_initconfdir $(DEFAULT_INITCONF_DIR)" $(DEFINE_INITRAMFS)'
RPM_DEFINE_KMOD='--define "kernels $(LINUX_VERSION)" --define "require_spldir $(SPL)" --define "require_splobj $(SPL_OBJ)" --define "ksrc $(LINUX)" --define "kobj $(LINUX_OBJ)"' RPM_DEFINE_KMOD='--define "kernels $(LINUX_VERSION)" --define "require_spldir $(SPL)" --define "require_splobj $(SPL_OBJ)" --define "ksrc $(LINUX)" --define "kobj $(LINUX_OBJ)"'
RPM_DEFINE_DKMS= RPM_DEFINE_DKMS=

View File

@ -55,7 +55,6 @@ ZFS_AC_LICENSE
ZFS_AC_PACKAGE ZFS_AC_PACKAGE
ZFS_AC_CONFIG ZFS_AC_CONFIG
ZFS_AC_DEBUG ZFS_AC_DEBUG
ZFS_AC_DEBUG_DMU_TX
AC_CONFIG_FILES([ AC_CONFIG_FILES([
Makefile Makefile

View File

@ -91,10 +91,6 @@ MOUNT_EXTRA_OPTIONS=""
# Only applicable for Debian GNU/Linux {dkms,initramfs}. # Only applicable for Debian GNU/Linux {dkms,initramfs}.
ZFS_DKMS_ENABLE_DEBUG='no' ZFS_DKMS_ENABLE_DEBUG='no'
# Build kernel modules with the --enable-debug-dmu-tx switch?
# Only applicable for Debian GNU/Linux {dkms,initramfs}.
ZFS_DKMS_ENABLE_DEBUG_DMU_TX='no'
# Keep debugging symbols in kernel modules? # Keep debugging symbols in kernel modules?
# Only applicable for Debian GNU/Linux {dkms,initramfs}. # Only applicable for Debian GNU/Linux {dkms,initramfs}.
ZFS_DKMS_DISABLE_STRIP='no' ZFS_DKMS_DISABLE_STRIP='no'

View File

@ -656,11 +656,6 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
*/ */
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
/*
* Tells if the given dbuf is freeable.
*/
boolean_t dmu_buf_freeable(dmu_buf_t *);
/* /*
* You must create a transaction, then hold the objects which you will * You must create a transaction, then hold the objects which you will
* (or might) modify as part of this transaction. Then you must assign * (or might) modify as part of this transaction. Then you must assign

View File

@ -86,7 +86,6 @@ extern "C" {
* held from: * held from:
* callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
* dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
* dmu_tx_count_free:
* dbuf_read_impl: db_mtx, dmu_zfetch() * dbuf_read_impl: db_mtx, dmu_zfetch()
* dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
* dbuf_new_size: db_mtx * dbuf_new_size: db_mtx
@ -197,7 +196,6 @@ extern "C" {
* dsl_prop_changed_notify: none (dd_prop_cbs) * dsl_prop_changed_notify: none (dd_prop_cbs)
* dsl_prop_register: none (dd_prop_cbs) * dsl_prop_register: none (dd_prop_cbs)
* dsl_prop_unregister: none (dd_prop_cbs) * dsl_prop_unregister: none (dd_prop_cbs)
* dsl_dataset_block_freeable: none (dd_sync_*)
* *
* os_lock (leaf) * os_lock (leaf)
* protects: * protects:

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */
@ -196,6 +196,7 @@ boolean_t dmu_objset_userobjspace_present(objset_t *os);
int dmu_fsname(const char *snapname, char *buf); int dmu_fsname(const char *snapname, char *buf);
void dmu_objset_evict_done(objset_t *os); void dmu_objset_evict_done(objset_t *os);
void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx);
void dmu_objset_init(void); void dmu_objset_init(void);
void dmu_objset_fini(void); void dmu_objset_fini(void);

View File

@ -23,7 +23,7 @@
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/* /*
* Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_DMU_TX_H #ifndef _SYS_DMU_TX_H
@ -70,6 +70,9 @@ struct dmu_tx {
/* has this transaction already been delayed? */ /* has this transaction already been delayed? */
boolean_t tx_waited; boolean_t tx_waited;
/* transaction is marked as being a "net free" of space */
boolean_t tx_netfree;
/* time this transaction was created */ /* time this transaction was created */
hrtime_t tx_start; hrtime_t tx_start;
@ -77,14 +80,6 @@ struct dmu_tx {
boolean_t tx_wait_dirty; boolean_t tx_wait_dirty;
int tx_err; int tx_err;
#ifdef DEBUG_DMU_TX
uint64_t tx_space_towrite;
uint64_t tx_space_tofree;
uint64_t tx_space_tooverwrite;
uint64_t tx_space_tounref;
refcount_t tx_space_written;
refcount_t tx_space_freed;
#endif
}; };
enum dmu_tx_hold_type { enum dmu_tx_hold_type {
@ -103,16 +98,10 @@ typedef struct dmu_tx_hold {
list_node_t txh_node; list_node_t txh_node;
struct dnode *txh_dnode; struct dnode *txh_dnode;
refcount_t txh_space_towrite; refcount_t txh_space_towrite;
refcount_t txh_space_tofree;
refcount_t txh_space_tooverwrite;
refcount_t txh_space_tounref;
refcount_t txh_memory_tohold; refcount_t txh_memory_tohold;
refcount_t txh_fudge;
#ifdef DEBUG_DMU_TX
enum dmu_tx_hold_type txh_type; enum dmu_tx_hold_type txh_type;
uint64_t txh_arg1; uint64_t txh_arg1;
uint64_t txh_arg2; uint64_t txh_arg2;
#endif
} dmu_tx_hold_t; } dmu_tx_hold_t;
typedef struct dmu_tx_callback { typedef struct dmu_tx_callback {
@ -172,12 +161,10 @@ dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
int dmu_tx_is_syncing(dmu_tx_t *tx); int dmu_tx_is_syncing(dmu_tx_t *tx);
int dmu_tx_private_ok(dmu_tx_t *tx); int dmu_tx_private_ok(dmu_tx_t *tx);
void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn);
void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
#ifdef DEBUG_DMU_TX #ifdef ZFS_DEBUG
#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) #define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
#else #else
#define DMU_TX_DIRTY_BUF(tx, db) #define DMU_TX_DIRTY_BUF(tx, db)

View File

@ -344,7 +344,6 @@ void dnode_verify(dnode_t *dn);
int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
void dnode_diduse_space(dnode_t *dn, int64_t space); void dnode_diduse_space(dnode_t *dn, int64_t space);
void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
void dnode_init(void); void dnode_init(void);

View File

@ -286,9 +286,6 @@ void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx); dmu_tx_t *tx);
int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx, boolean_t async); dmu_tx_t *tx, boolean_t async);
boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
uint64_t *value); uint64_t *value);

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */
@ -137,8 +137,7 @@ uint64_t dsl_dir_space_available(dsl_dir_t *dd,
void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx);
dmu_tx_t *tx);
void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,

View File

@ -795,7 +795,7 @@ extern uint64_t spa_version(spa_t *spa);
extern pool_state_t spa_state(spa_t *spa); extern pool_state_t spa_state(spa_t *spa);
extern spa_load_state_t spa_load_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_dspace(spa_t *spa);
extern uint64_t spa_get_slop_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa);
extern void spa_update_dspace(spa_t *spa); extern void spa_update_dspace(spa_t *spa);

View File

@ -54,14 +54,6 @@ DECLARE_EVENT_CLASS(zfs_delay_mintime_class,
__field(hrtime_t, tx_start) __field(hrtime_t, tx_start)
__field(boolean_t, tx_wait_dirty) __field(boolean_t, tx_wait_dirty)
__field(int, tx_err) __field(int, tx_err)
#ifdef DEBUG_DMU_TX
__field(uint64_t, tx_space_towrite)
__field(uint64_t, tx_space_tofree)
__field(uint64_t, tx_space_tooverwrite)
__field(uint64_t, tx_space_tounref)
__field(int64_t, tx_space_written)
__field(int64_t, tx_space_freed)
#endif
__field(uint64_t, min_tx_time) __field(uint64_t, min_tx_time)
__field(uint64_t, dirty) __field(uint64_t, dirty)
), ),
@ -74,32 +66,15 @@ DECLARE_EVENT_CLASS(zfs_delay_mintime_class,
__entry->tx_start = tx->tx_start; __entry->tx_start = tx->tx_start;
__entry->tx_wait_dirty = tx->tx_wait_dirty; __entry->tx_wait_dirty = tx->tx_wait_dirty;
__entry->tx_err = tx->tx_err; __entry->tx_err = tx->tx_err;
#ifdef DEBUG_DMU_TX
__entry->tx_space_towrite = tx->tx_space_towrite;
__entry->tx_space_tofree = tx->tx_space_tofree;
__entry->tx_space_tooverwrite = tx->tx_space_tooverwrite;
__entry->tx_space_tounref = tx->tx_space_tounref;
__entry->tx_space_written = tx->tx_space_written.rc_count;
__entry->tx_space_freed = tx->tx_space_freed.rc_count;
#endif
__entry->dirty = dirty; __entry->dirty = dirty;
__entry->min_tx_time = min_tx_time; __entry->min_tx_time = min_tx_time;
), ),
TP_printk("tx { txg %llu lastsnap_txg %llu tx_lasttried_txg %llu " TP_printk("tx { txg %llu lastsnap_txg %llu tx_lasttried_txg %llu "
"anyobj %d waited %d start %llu wait_dirty %d err %i " "anyobj %d waited %d start %llu wait_dirty %d err %i "
#ifdef DEBUG_DMU_TX
"space_towrite %llu space_tofree %llu space_tooverwrite %llu "
"space_tounref %llu space_written %lli space_freed %lli "
#endif
"} dirty %llu min_tx_time %llu", "} dirty %llu min_tx_time %llu",
__entry->tx_txg, __entry->tx_lastsnap_txg, __entry->tx_txg, __entry->tx_lastsnap_txg,
__entry->tx_lasttried_txg, __entry->tx_anyobj, __entry->tx_waited, __entry->tx_lasttried_txg, __entry->tx_anyobj, __entry->tx_waited,
__entry->tx_start, __entry->tx_wait_dirty, __entry->tx_err, __entry->tx_start, __entry->tx_wait_dirty, __entry->tx_err,
#ifdef DEBUG_DMU_TX
__entry->tx_space_towrite, __entry->tx_space_tofree,
__entry->tx_space_tooverwrite, __entry->tx_space_tounref,
__entry->tx_space_written, __entry->tx_space_freed,
#endif
__entry->dirty, __entry->min_tx_time) __entry->dirty, __entry->min_tx_time)
); );
/* END CSTYLED */ /* END CSTYLED */

View File

@ -216,8 +216,6 @@ int fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf, uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *normalization_conflictp); char *realname, int rn_len, boolean_t *normalization_conflictp);
void fzap_prefetch(zap_name_t *zn); void fzap_prefetch(zap_name_t *zn);
int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
refcount_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, void *tag, dmu_tx_t *tx); const void *val, void *tag, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn, int fzap_update(zap_name_t *zn,

View File

@ -1432,41 +1432,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
mutex_exit(&dn->dn_dbufs_mtx); mutex_exit(&dn->dn_dbufs_mtx);
} }
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
/*
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*
* This logic ensures that only block births for
* filled blocks are considered.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_last_dirty && (db->db_blkptr == NULL ||
!BP_IS_HOLE(db->db_blkptr))) {
birth_txg = db->db_last_dirty->dr_txg;
} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
birth_txg = db->db_blkptr->blk_birth;
}
/*
* If this block don't exist or is in a snapshot, it can't be freed.
* Don't pass the bp to dsl_dataset_block_freeable() since we
* are holding the db_mtx lock and might deadlock if we are
* prefetching a dedup-ed block.
*/
if (birth_txg != 0)
return (ds == NULL ||
dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
return (B_FALSE);
}
void void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{ {
@ -1516,7 +1481,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
} }
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dnode_willuse_space(dn, size-osize, tx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
} }
@ -1566,7 +1531,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
objset_t *os; objset_t *os;
dbuf_dirty_record_t **drp, *dr; dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE; int drop_struct_lock = FALSE;
boolean_t do_free_accounting = B_FALSE;
int txgoff = tx->tx_txg & TXG_MASK; int txgoff = tx->tx_txg & TXG_MASK;
ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_txg != 0);
@ -1688,15 +1652,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
if (db->db_blkid != DMU_BONUS_BLKID) { if (db->db_blkid != DMU_BONUS_BLKID) {
/* dmu_objset_willuse_space(os, db->db.db_size, tx);
* Update the accounting.
* Note: we delay "free accounting" until after we drop
* the db_mtx. This keeps us from grabbing other locks
* (and possibly deadlocking) in bp_get_dsize() while
* also holding the db_mtx.
*/
dnode_willuse_space(dn, db->db.db_size, tx);
do_free_accounting = dbuf_block_freeable(db);
} }
/* /*
@ -1790,21 +1746,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drop_struct_lock = TRUE; drop_struct_lock = TRUE;
} }
if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
bp_get_dsize(os->os_spa, bp) : db->db.db_size;
/* /*
* This is only a guess -- if the dbuf is dirty * If we are overwriting a dedup BP, then unless it is snapshotted,
* in a previous txg, we don't know how much * when we get to syncing context we will need to decrement its
* space it will use on disk yet. We should * refcount in the DDT. Prefetch the relevant DDT block so that
* really have the struct_rwlock to access * syncing context won't have to wait for the i/o.
* db_blkptr, but since this is just a guess,
* it's OK if we get an odd answer.
*/ */
ddt_prefetch(os->os_spa, bp); ddt_prefetch(os->os_spa, db->db_blkptr);
dnode_willuse_space(dn, -willfree, tx);
}
if (db->db_level == 0) { if (db->db_level == 0) {
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
@ -3092,19 +3040,6 @@ dmu_buf_user_evict_wait()
taskq_wait(dbu_evict_taskq); taskq_wait(dbu_evict_taskq);
} }
boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
boolean_t res = B_FALSE;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
if (db->db_blkptr)
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
db->db_blkptr, db->db_blkptr->blk_birth);
return (res);
}
blkptr_t * blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db) dmu_buf_get_blkptr(dmu_buf_t *db)
{ {
@ -3891,7 +3826,6 @@ EXPORT_SYMBOL(dbuf_sync_list);
EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user);
EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_freeable);
EXPORT_SYMBOL(dmu_buf_get_blkptr); EXPORT_SYMBOL(dmu_buf_get_blkptr);
/* BEGIN CSTYLED */ /* BEGIN CSTYLED */

View File

@ -2344,6 +2344,23 @@ dmu_fsname(const char *snapname, char *buf)
return (0); return (0);
} }
/*
* Call when we think we're going to write/free space in open context to track
* the amount of dirty data in the open txg, which is also the amount
* of memory that can not be evicted until this txg syncs.
*/
void
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
if (ds != NULL) {
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}
}
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool); EXPORT_SYMBOL(dmu_objset_pool);

File diff suppressed because it is too large Load Diff

View File

@ -1948,25 +1948,6 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
mutex_exit(&dn->dn_mtx); mutex_exit(&dn->dn_mtx);
} }
/*
* Call when we think we're going to write/free space in open context to track
* the amount of memory in use by the currently open txg.
*/
void
dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
{
objset_t *os = dn->dn_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
int64_t aspace = spa_get_asize(os->os_spa, space);
if (ds != NULL) {
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}
dmu_tx_willuse_space(tx, aspace);
}
/* /*
* Scans a block at the indicated "level" looking for a hole or data, * Scans a block at the indicated "level" looking for a hole or data,
* depending on 'flags'. * depending on 'flags'.

View File

@ -242,42 +242,6 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
return (used); return (used);
} }
uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
uint64_t trysnap = 0;
if (ds == NULL)
return (0);
/*
* The snapshot creation could fail, but that would cause an
* incorrect FALSE return, which would only result in an
* overestimation of the amount of space that an operation would
* consume, which is OK.
*
* There's also a small window where we could miss a pending
* snapshot, because we could set the sync task in the quiescing
* phase. So this should only be used as a guess.
*/
if (ds->ds_trysnap_txg >
spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
trysnap = ds->ds_trysnap_txg;
return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
}
boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
uint64_t blk_birth)
{
if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
(bp != NULL && BP_IS_HOLE(bp)))
return (B_FALSE);
ddt_prefetch(dsl_dataset_get_spa(ds), bp);
return (B_TRUE);
}
/* /*
* We have to release the fsid syncronously or we risk that a subsequent * We have to release the fsid syncronously or we risk that a subsequent
* mount of the same dataset will fail to unique_insert the fsid. This * mount of the same dataset will fail to unique_insert the fsid. This
@ -3731,8 +3695,6 @@ EXPORT_SYMBOL(dsl_dataset_space_wouldfree);
EXPORT_SYMBOL(dsl_dataset_sync); EXPORT_SYMBOL(dsl_dataset_sync);
EXPORT_SYMBOL(dsl_dataset_block_born); EXPORT_SYMBOL(dsl_dataset_block_born);
EXPORT_SYMBOL(dsl_dataset_block_kill); EXPORT_SYMBOL(dsl_dataset_block_kill);
EXPORT_SYMBOL(dsl_dataset_block_freeable);
EXPORT_SYMBOL(dsl_dataset_prev_snap_txg);
EXPORT_SYMBOL(dsl_dataset_dirty); EXPORT_SYMBOL(dsl_dataset_dirty);
EXPORT_SYMBOL(dsl_dataset_stats); EXPORT_SYMBOL(dsl_dataset_stats);
EXPORT_SYMBOL(dsl_dataset_fast_stat); EXPORT_SYMBOL(dsl_dataset_fast_stat);

View File

@ -1031,13 +1031,12 @@ static uint64_t
dsl_dir_space_towrite(dsl_dir_t *dd) dsl_dir_space_towrite(dsl_dir_t *dd)
{ {
uint64_t space = 0; uint64_t space = 0;
int i;
ASSERT(MUTEX_HELD(&dd->dd_lock)); ASSERT(MUTEX_HELD(&dd->dd_lock));
for (i = 0; i < TXG_SIZE; i++) { for (int i = 0; i < TXG_SIZE; i++) {
space += dd->dd_space_towrite[i&TXG_MASK]; space += dd->dd_space_towrite[i & TXG_MASK];
ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
} }
return (space); return (space);
} }
@ -1117,16 +1116,13 @@ struct tempreserve {
static int static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, boolean_t ignorequota, list_t *tr_list,
dmu_tx_t *tx, boolean_t first) dmu_tx_t *tx, boolean_t first)
{ {
uint64_t txg = tx->tx_txg; uint64_t txg = tx->tx_txg;
uint64_t est_inflight, used_on_disk, quota, parent_rsrv; uint64_t quota;
uint64_t deferred = 0;
struct tempreserve *tr; struct tempreserve *tr;
int retval = EDQUOT; int retval = EDQUOT;
int txgidx = txg & TXG_MASK;
int i;
uint64_t ref_rsrv = 0; uint64_t ref_rsrv = 0;
ASSERT3U(txg, !=, 0); ASSERT3U(txg, !=, 0);
@ -1138,10 +1134,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* Check against the dsl_dir's quota. We don't add in the delta * Check against the dsl_dir's quota. We don't add in the delta
* when checking for over-quota because they get one free hit. * when checking for over-quota because they get one free hit.
*/ */
est_inflight = dsl_dir_space_towrite(dd); uint64_t est_inflight = dsl_dir_space_towrite(dd);
for (i = 0; i < TXG_SIZE; i++) for (int i = 0; i < TXG_SIZE; i++)
est_inflight += dd->dd_tempreserved[i]; est_inflight += dd->dd_tempreserved[i];
used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
/* /*
* On the first iteration, fetch the dataset's used-on-disk and * On the first iteration, fetch the dataset's used-on-disk and
@ -1152,9 +1148,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
int error; int error;
dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
error = dsl_dataset_check_quota(ds, checkrefquota, error = dsl_dataset_check_quota(ds, !netfree,
asize, est_inflight, &used_on_disk, &ref_rsrv); asize, est_inflight, &used_on_disk, &ref_rsrv);
if (error) { if (error != 0) {
mutex_exit(&dd->dd_lock); mutex_exit(&dd->dd_lock);
DMU_TX_STAT_BUMP(dmu_tx_quota); DMU_TX_STAT_BUMP(dmu_tx_quota);
return (error); return (error);
@ -1180,6 +1176,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
* we're very close to full, this will allow a steady trickle of * we're very close to full, this will allow a steady trickle of
* removes to get through. * removes to get through.
*/ */
uint64_t deferred = 0;
if (dd->dd_parent == NULL) { if (dd->dd_parent == NULL) {
spa_t *spa = dd->dd_pool->dp_spa; spa_t *spa = dd->dd_pool->dp_spa;
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
@ -1210,9 +1207,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
} }
/* We need to up our estimated delta before dropping dd_lock */ /* We need to up our estimated delta before dropping dd_lock */
dd->dd_tempreserved[txgidx] += asize; dd->dd_tempreserved[txg & TXG_MASK] += asize;
parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
asize - ref_rsrv); asize - ref_rsrv);
mutex_exit(&dd->dd_lock); mutex_exit(&dd->dd_lock);
@ -1222,11 +1219,11 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
list_insert_tail(tr_list, tr); list_insert_tail(tr_list, tr);
/* see if it's OK with our parent */ /* see if it's OK with our parent */
if (dd->dd_parent && parent_rsrv) { if (dd->dd_parent != NULL && parent_rsrv != 0) {
boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
return (dsl_dir_tempreserve_impl(dd->dd_parent, return (dsl_dir_tempreserve_impl(dd->dd_parent,
parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE));
} else { } else {
return (0); return (0);
} }
@ -1240,7 +1237,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
*/ */
int int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
{ {
int err; int err;
list_t *tr_list; list_t *tr_list;
@ -1254,7 +1251,6 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
list_create(tr_list, sizeof (struct tempreserve), list_create(tr_list, sizeof (struct tempreserve),
offsetof(struct tempreserve, tr_node)); offsetof(struct tempreserve, tr_node));
ASSERT3S(asize, >, 0); ASSERT3S(asize, >, 0);
ASSERT3S(fsize, >=, 0);
err = arc_tempreserve_space(lsize, tx->tx_txg); err = arc_tempreserve_space(lsize, tx->tx_txg);
if (err == 0) { if (err == 0) {
@ -1281,8 +1277,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
} }
if (err == 0) { if (err == 0) {
err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, err = dsl_dir_tempreserve_impl(dd, asize, netfree,
FALSE, asize > usize, tr_list, tx, TRUE); B_FALSE, tr_list, tx, B_TRUE);
} }
if (err != 0) if (err != 0)

View File

@ -1615,7 +1615,7 @@ spa_freeze_txg(spa_t *spa)
/* ARGSUSED */ /* ARGSUSED */
uint64_t uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize) spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
{ {
return (lsize * spa_asize_inflation); return (lsize * spa_asize_inflation);
} }
@ -2078,7 +2078,6 @@ EXPORT_SYMBOL(spa_version);
EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_state);
EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_load_state);
EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_freeze_txg);
EXPORT_SYMBOL(spa_get_asize);
EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_get_dspace);
EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_update_dspace);
EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_deflate);

View File

@ -1357,64 +1357,3 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} }
} }
} }
int
fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
refcount_t *tooverwrite)
{
zap_t *zap = zn->zn_zap;
zap_leaf_t *l;
int err;
/*
* Account for the header block of the fatzap.
*/
if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
(void) refcount_add_many(tooverwrite,
zap->zap_dbuf->db_size, FTAG);
} else {
(void) refcount_add_many(towrite,
zap->zap_dbuf->db_size, FTAG);
}
/*
* Account for the pointer table blocks.
* If we are adding we need to account for the following cases :
* - If the pointer table is embedded, this operation could force an
* external pointer table.
* - If this already has an external pointer table this operation
* could extend the table.
*/
if (add) {
if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
(void) refcount_add_many(towrite,
zap->zap_dbuf->db_size, FTAG);
} else {
(void) refcount_add_many(towrite,
zap->zap_dbuf->db_size * 3, FTAG);
}
}
/*
* Now, check if the block containing leaf is freeable
* and account accordingly.
*/
err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
if (err != 0) {
return (err);
}
if (!add && dmu_buf_freeable(l->l_dbuf)) {
(void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG);
} else {
/*
* If this an add operation, the leaf block could split.
* Hence, we need to account for an additional leaf block.
*/
(void) refcount_add_many(towrite,
(add ? 2 : 1) * l->l_dbuf->db_size, FTAG);
}
zap_put_leaf(l);
return (0);
}

View File

@ -1594,88 +1594,6 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
return (0); return (0);
} }
int
zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
refcount_t *towrite, refcount_t *tooverwrite)
{
zap_t *zap;
int err = 0;
/*
* Since, we don't have a name, we cannot figure out which blocks will
* be affected in this operation. So, account for the worst case :
* - 3 blocks overwritten: target leaf, ptrtbl block, header block
* - 4 new blocks written if adding:
* - 2 blocks for possibly split leaves,
* - 2 grown ptrtbl blocks
*
* This also accommodates the case where an add operation to a fairly
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
(void) refcount_add_many(towrite,
(3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
return (err);
}
/*
* We lock the zap with adding == FALSE. Because, if we pass
* the actual value of add, it could trigger a mzap_upgrade().
* At present we are just evaluating the possibility of this operation
* and hence we do not want to trigger an upgrade.
*/
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);
if (!zap->zap_ismicro) {
zap_name_t *zn = zap_name_alloc(zap, name, 0);
if (zn) {
err = fzap_count_write(zn, add, towrite,
tooverwrite);
zap_name_free(zn);
} else {
/*
* We treat this case as similar to (name == NULL)
*/
(void) refcount_add_many(towrite,
(3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
}
} else {
/*
* We are here if (name != NULL) and this is a micro-zap.
* We account for the header block depending on whether it
* is freeable.
*
* Incase of an add-operation it is hard to find out
* if this add will promote this microzap to fatzap.
* Hence, we consider the worst case and account for the
* blocks assuming this microzap would be promoted to a
* fatzap.
*
* 1 block overwritten : header block
* 4 new blocks written : 2 new split leaf, 2 grown
* ptrtbl blocks
*/
if (dmu_buf_freeable(zap->zap_dbuf)) {
(void) refcount_add_many(tooverwrite,
MZAP_MAX_BLKSZ, FTAG);
} else {
(void) refcount_add_many(towrite,
MZAP_MAX_BLKSZ, FTAG);
}
if (add) {
(void) refcount_add_many(towrite,
4 * MZAP_MAX_BLKSZ, FTAG);
}
}
zap_unlockdir(zap, FTAG);
return (err);
}
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zap_create); EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize); EXPORT_SYMBOL(zap_create_dnsize);
@ -1694,7 +1612,6 @@ EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains); EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch); EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_count_write_by_dnode);
EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64); EXPORT_SYMBOL(zap_add_uint64);