From 539d33c791da2f970cfa5a1bddf0533b23146265 Mon Sep 17 00:00:00 2001 From: George Melikov Date: Wed, 1 Feb 2017 01:44:03 +0300 Subject: [PATCH] OpenZFS 6569 - large file delete can starve out write ops Authored by: Alek Pinchuk Reviewed by: Matt Ahrens Reviewed by: Sanjay Nadkarni Reviewed by: Pavel Zakharov Reviewed-by: Brian Behlendorf Ported-by: George Melikov Tested-by: kernelOfTruth OpenZFS-issue: https://www.illumos.org/issues/6569 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/1bf4b6f2 Closes #5706 --- include/sys/dsl_pool.h | 2 ++ include/sys/trace_dmu.h | 30 ++++++++++++++++++++++ module/zfs/dmu.c | 56 +++++++++++++++++++++++++++++++++++++---- module/zfs/dsl_pool.c | 11 ++++++++ 4 files changed, 94 insertions(+), 5 deletions(-) diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 48b12e8eb1..b509d312b6 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_DSL_POOL_H @@ -106,6 +107,7 @@ typedef struct dsl_pool { kcondvar_t dp_spaceavail_cv; uint64_t dp_dirty_pertxg[TXG_SIZE]; uint64_t dp_dirty_total; + uint64_t dp_long_free_dirty_pertxg[TXG_SIZE]; uint64_t dp_mos_used_delta; uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; diff --git a/include/sys/trace_dmu.h b/include/sys/trace_dmu.h index 916c9bdbae..b2f37a6be4 100644 --- a/include/sys/trace_dmu.h +++ b/include/sys/trace_dmu.h @@ -112,6 +112,36 @@ DEFINE_EVENT(zfs_delay_mintime_class, name, \ /* END CSTYLED */ DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime); +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_free_long_range_class, + TP_PROTO(uint64_t long_free_dirty_all_txgs, uint64_t chunk_len, \ + uint64_t txg), + TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg), + TP_STRUCT__entry( + __field(uint64_t, long_free_dirty_all_txgs) + __field(uint64_t, chunk_len) + __field(uint64_t, txg) + ), + TP_fast_assign( + __entry->long_free_dirty_all_txgs = long_free_dirty_all_txgs; + __entry->chunk_len = chunk_len; + __entry->txg = txg; + ), + TP_printk("long_free_dirty_all_txgs %llu chunk_len %llu txg %llu", + __entry->long_free_dirty_all_txgs, + __entry->chunk_len, __entry->txg) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_FREE_LONG_RANGE_EVENT(name) \ +DEFINE_EVENT(zfs_free_long_range_class, name, \ + TP_PROTO(uint64_t long_free_dirty_all_txgs, \ + uint64_t chunk_len, uint64_t txg), \ + TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg)) +/* END CSTYLED */ +DEFINE_FREE_LONG_RANGE_EVENT(zfs_free__long__range); + #endif /* _TRACE_DMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cdbcfe2505..b0bceac254 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -48,6 +48,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -58,6 +59,14 @@ */ int zfs_nopwrite_enabled = 1; +/* + * Tunable to control percentage of dirtied blocks from frees in one TXG. + * After this threshold is crossed, additional dirty blocks from frees + * wait until the next TXG. + * A value of zero will disable this throttle. + */ +uint32_t zfs_per_txg_dirty_frees_percent = 30; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, @@ -727,6 +736,9 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, { uint64_t object_size; int err; + uint64_t dirty_frees_threshold; + dsl_pool_t *dp = dmu_objset_pool(os); + int t; if (dn == NULL) return (SET_ERROR(EINVAL)); @@ -735,11 +747,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, if (offset >= object_size) return (0); + if (zfs_per_txg_dirty_frees_percent <= 100) + dirty_frees_threshold = + zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; + else + dirty_frees_threshold = zfs_dirty_data_max / 4; + if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { - uint64_t chunk_end, chunk_begin; + uint64_t chunk_end, chunk_begin, chunk_len; + uint64_t long_free_dirty_all_txgs = 0; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) @@ -754,9 +773,28 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); + chunk_len = chunk_end - chunk_begin; + + mutex_enter(&dp->dp_lock); + for (t = 0; t < TXG_SIZE; t++) { + long_free_dirty_all_txgs += + dp->dp_long_free_dirty_pertxg[t]; + } + mutex_exit(&dp->dp_lock); + + /* + * To avoid filling up a TXG with just frees wait for + * the next TXG to open before freeing more chunks if + * we have reached the threshold of frees + */ + if (dirty_frees_threshold != 0 && + long_free_dirty_all_txgs >= dirty_frees_threshold) { + txg_wait_open(dp, 0); + continue; + } + tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, dn->dn_object, - chunk_begin, chunk_end - chunk_begin); + dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net @@ -768,10 +806,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, dmu_tx_abort(tx); return (err); } - dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); + + mutex_enter(&dp->dp_lock); + dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += + chunk_len; + mutex_exit(&dp->dp_lock); + DTRACE_PROBE3(free__long__range, + uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, + uint64_t, dmu_tx_get_txg(tx)); + dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); - length -= chunk_end - chunk_begin; + length -= chunk_len; } return (0); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 2ff3ae4568..1b8b780aa4 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include @@ -509,6 +510,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); + /* + * Update the long range free counter after + * we're done syncing user data + */ + mutex_enter(&dp->dp_lock); + ASSERT(spa_sync_pass(dp->dp_spa) == 1 || + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; + mutex_exit(&dp->dp_lock); + /* * After the data blocks have been written (ensured by the zio_wait() * above), update the user/group space accounting.