From 498877baf5038b32c1531e5ec96b435023200f4d Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 16 May 2013 14:18:06 -0700 Subject: [PATCH] Illumos #3112, #3113, #3114 3112 ztest does not honor ZFS_DEBUG 3113 ztest should use watchpoints to protect frozen arc bufs 3114 some leaked nvlists in zfsdev_ioctl Reviewed by: Adam Leventhal Reviewed by: Matt Amdur Reviewed by: George Wilson Reviewed by: Christopher Siden Approved by: Eric Schrock References: https://www.illumos.org/issues/3112 https://www.illumos.org/issues/3113 https://www.illumos.org/issues/3114 illumos/illumos-gate@cd1c8b85eb30b568e9816221430c479ace7a559d The /proc/self/cmd watchpoint interface is specific to Solaris. Therefore, the #3113 implementation was reworked to use the more portable mprotect(2) system call. When the pages are watched they are marked read-only for protection. Any write to the protected address range immediately trigger a SIGSEGV. The pages are marked writable again when they are unwatched. Ported-by: Brian Behlendorf Issue #1489 --- cmd/ztest/ztest.c | 3 +- include/sys/arc.h | 5 ++++ include/sys/zfs_context.h | 2 ++ module/zfs/arc.c | 60 +++++++++++++++++++++++++++++++++------ module/zfs/spa_misc.c | 17 +++++++++++ module/zfs/zio.c | 14 +++++++-- 6 files changed, 89 insertions(+), 12 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 7b09532c05..3b26f8d1eb 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -6210,11 +6210,12 @@ main(int argc, char **argv) (void) setvbuf(stdout, NULL, _IOLBF, 0); + dprintf_setup(&argc, argv); + ztest_fd_rand = open("/dev/urandom", O_RDONLY); ASSERT3S(ztest_fd_rand, >=, 0); if (!fd_data_str) { - dprintf_setup(&argc, argv); process_options(argc, argv); setup_data_fd(); diff --git a/include/sys/arc.h b/include/sys/arc.h index efafb551ac..221946da30 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -136,6 +136,7 @@ int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); +void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); boolean_t arc_buf_eviction_needed(arc_buf_t *buf); @@ -183,6 +184,10 @@ extern int zfs_write_limit_shift; extern unsigned long zfs_write_limit_max; extern kmutex_t zfs_write_limit_lock; +#ifndef _KERNEL +extern boolean_t arc_watch; +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index adb152f58b..28a1306aa4 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -97,6 +97,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 04f87a810b..5bba9a317e 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -145,6 +145,11 @@ #include #include +#ifndef _KERNEL +/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ +boolean_t arc_watch = B_FALSE; +#endif + static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; @@ -569,6 +574,7 @@ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type); +static void arc_buf_watch(arc_buf_t *buf); static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); @@ -1060,6 +1066,37 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_freeze_lock); + arc_buf_watch(buf); +} + +#ifndef _KERNEL +void +arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) +{ + panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr); +} +#endif + +/* ARGSUSED */ +static void +arc_buf_unwatch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) { + ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, + PROT_READ | PROT_WRITE)); + } +#endif +} + +/* ARGSUSED */ +static void +arc_buf_watch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) + ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ)); +#endif } void @@ -1080,6 +1117,8 @@ arc_buf_thaw(arc_buf_t *buf) } mutex_exit(&buf->b_hdr->b_freeze_lock); + + arc_buf_unwatch(buf); } void @@ -1097,6 +1136,7 @@ arc_buf_freeze(arc_buf_t *buf) buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); + } static void @@ -1504,21 +1544,22 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) * the buffer is placed on l2arc_free_on_write to be freed later. */ static void -arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - void *data, size_t size) +arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (HDR_L2_WRITING(hdr)) { l2arc_data_free_t *df; df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE); - df->l2df_data = data; - df->l2df_size = size; + df->l2df_data = buf->b_data; + df->l2df_size = hdr->b_size; df->l2df_func = free_func; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { - free_func(data, size); + free_func(buf->b_data, hdr->b_size); } } @@ -1534,16 +1575,15 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); + arc_buf_unwatch(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf->b_hdr, zio_buf_free, - buf->b_data, size); + arc_buf_data_free(buf, zio_buf_free); arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf->b_hdr, - zio_data_buf_free, buf->b_data, size); + arc_buf_data_free(buf, zio_data_buf_free); ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } @@ -2908,6 +2948,7 @@ arc_read_done(zio_t *zio) } arc_cksum_compute(buf, B_FALSE); + arc_buf_watch(buf); if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { /* @@ -3542,6 +3583,7 @@ arc_release(arc_buf_t *buf, void *tag) } hdr->b_datacnt -= 1; arc_cksum_verify(buf); + arc_buf_unwatch(buf); mutex_exit(hash_lock); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 7bdfbf7389..91e7fdf358 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1630,6 +1630,23 @@ spa_init(int mode) spa_mode_global = mode; +#ifndef _KERNEL + if (spa_mode_global != FREAD && dprintf_find_string("watch")) { + struct sigaction sa; + + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sa.sa_sigaction = arc_buf_sigsegv; + + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("could not enable watchpoints: " + "sigaction(SIGSEGV, ...) = "); + } else { + arc_watch = B_TRUE; + } + } +#endif + fm_init(); refcount_init(); unique_init(); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1f803a44b2..f0c9c0f08d 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -169,11 +169,21 @@ zio_init(void) while (p2 & (p2 - 1)) p2 &= p2 - 1; +#ifndef _KERNEL + /* + * If we are using watchpoints, put each buffer on its own page, + * to eliminate the performance overhead of trapping to the + * kernel when modifying a non-watched buffer that shares the + * page with a watched buffer. + */ + if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) + continue; +#endif if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; - } else if (P2PHASE(size, PAGESIZE) == 0) { + } else if (IS_P2ALIGNED(size, PAGESIZE)) { align = PAGESIZE; - } else if (P2PHASE(size, p2 >> 2) == 0) { + } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = p2 >> 2; }