From 98d5d8bd50f49eaed7ea847f07af3cd0e7cb7454 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 14 May 2010 11:57:48 -0700 Subject: [PATCH 1/3] Add missing include path for FMA aware zpool command. --- cmd/zpool/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index 3f30eff47d..ec9757bd62 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -7,6 +7,7 @@ DEFAULT_INCLUDES += \ -I${top_srcdir}/lib/libzfs/include \ -I${top_srcdir}/lib/libnvpair/include \ -I${top_srcdir}/module/zcommon/include \ + -I${top_srcdir}/module/zfs/include \ -I${top_srcdir}/module/nvpair/include \ -I${top_srcdir}/module/avl/include \ -I${top_srcdir}/module/unicode/include From 97d19a5e45cdfef88d5e3fa25aa03b009e0257aa Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 14 May 2010 12:40:44 -0700 Subject: [PATCH 2/3] Add linux-events topic branch for zevent handling. This topic branch leverages the Solaris style FMA call points in ZFS to create a user space visible event notification system under Linux. This new system is called zevent and it unifies all previous Solaris style ereports and sysevent notifications. Under this Linux specific scheme when a sysevent or ereport event occurs an nvlist describing the event is created which looks almost exactly like a Solaris ereport. These events are queued up in the kernel when they occur and conditionally logged to the console. It is then up to a user space application to consume the events and do whatever it likes with them. To make this possible the existing /dev/zfs ABI has been extended with two new ioctls which behave as follows. * ZFS_IOC_EVENTS_NEXT Get the next pending event. The kernel will keep track of the last event consumed by the file descriptor and provide the next one if available. If no new events are available the ioctl() will block waiting for the next event. 
This ioctl may also be called in a non-blocking mode by setting zc.zc_guid = ZEVENT_NONBLOCK. In the non-blocking case if no events are available ENOENT will be returned. It is possible that ESHUTDOWN will be returned if the ioctl() is called while module unloading is in progress. And finally ENOMEM may occur if the provided nvlist buffer is not large enough to contain the entire event. * ZFS_IOC_EVENTS_CLEAR Clear all events queued by the kernel. The kernel will keep a fairly large number of recent events queued; use this ioctl to clear the in kernel list. This will affect all user space processes consuming events. The zpool command has been extended to use this events ABI with the 'events' subcommand. You may run 'zpool events -v' to output a verbose log of all recent events. This is very similar to the Solaris 'fmdump -ev' command with the key difference being it also includes what would be considered sysevents under Solaris. You may also run in follow mode with the '-f' option. To clear the in kernel event queue use the '-c' option. $ sudo cmd/zpool/zpool events -fv TIME CLASS May 13 2010 16:31:15.777711000 ereport.fs.zfs.config.sync class = "ereport.fs.zfs.config.sync" ena = 0x40982b7897700001 detector = (embedded nvlist) version = 0x0 scheme = "zfs" pool = 0xed976600de75dfa6 (end detector) time = 0x4bec8bc3 0x2e5aed98 pool = "zpios" pool_guid = 0xed976600de75dfa6 pool_context = 0x0 While the 'zpool events' command is handy for interactive debugging it is not expected to be the primary consumer of zevents. This ABI was primarily added to facilitate the addition of a user space monitoring daemon. This daemon would consume all events posted by the kernel and based on the type of event perform an action. For most events simply forwarding them on to syslog is likely enough. 
But this interface also cleanly allows for more sophisticated actions to be taken such as generating an email for a failed drive --- .topdeps | 4 +- .topmsg | 77 ++- cmd/zpool/zpool_main.c | 328 ++++++++++++- lib/libzfs/include/libzfs.h | 2 + lib/libzfs/libzfs_pool.c | 86 ++++ lib/libzpool/include/sys/zfs_context.h | 4 +- module/zcommon/include/sys/fs/zfs.h | 4 +- module/zfs/dsl_scrub.c | 7 +- module/zfs/fm.c | 650 +++++++++++++++---------- module/zfs/include/sys/fm/fs/zfs.h | 15 +- module/zfs/include/sys/fm/protocol.h | 15 +- module/zfs/include/sys/fm/util.h | 39 +- module/zfs/include/sys/spa.h | 2 - module/zfs/include/sys/zfs_context.h | 5 +- module/zfs/spa.c | 74 +-- module/zfs/spa_config.c | 2 +- module/zfs/spa_misc.c | 3 + module/zfs/vdev.c | 5 +- module/zfs/zfs_fm.c | 68 +-- 19 files changed, 953 insertions(+), 437 deletions(-) diff --git a/.topdeps b/.topdeps index 607c231780..7f16cbcdd5 100644 --- a/.topdeps +++ b/.topdeps @@ -1,3 +1 @@ -gcc-branch -fix-branch -feature-branch +zfs-branch diff --git a/.topmsg b/.topmsg index 03967cdff5..410295fbf7 100644 --- a/.topmsg +++ b/.topmsg @@ -1,19 +1,70 @@ From: Brian Behlendorf -Subject: [PATCH] zfs branch +Subject: [PATCH] linux-events -Merged result of all changes which are relevant to both Solaris -and Linux builds of the ZFS code. These are changes where there -is a reasonable chance they will be accepted upstream. +This topic branch leverages the Solaris style FMA call points +in ZFS to create a user space visible event notification system +under Linux. This new system is called zevent and it unifies +all previous Solaris style ereports and sysevent notifications. -Additionally, since this is effectively the root of the linux -ZFS tree the core linux build system is added here. This -includes autogen.sh, configure.ac, m4 macros, some scripts/*, -and makefiles for all the core ZFS components. Linux-only -features which require tweaks to the build system should appear -on the relevant topic branches. 
All autotools products which -result from autogen.sh are commited to the linux-configure-branch. +Under this Linux specific scheme when a sysevent or ereport event +occurs an nvlist describing the event is created which looks almost +exactly like a Solaris ereport. These events are queued up in the +kernel when they occur and conditionally logged to the console. +It is then up to a user space application to consume the events +and do whatever it likes with them. -This branch also contains the META, ChangeLog, AUTHORS, TODO, -and README, files. +To make this possible the existing /dev/zfs ABI has been extended +with two new ioctls which behave as follows. + +* ZFS_IOC_EVENTS_NEXT +Get the next pending event. The kernel will keep track of the last +event consumed by the file descriptor and provide the next one if +available. If no new events are available the ioctl() will block +waiting for the next event. This ioctl may also be called in a +non-blocking mode by setting zc.zc_guid = ZEVENT_NONBLOCK. In the +non-blocking case if no events are available ENOENT will be returned. +It is possible that ESHUTDOWN will be returned if the ioctl() is +called while module unloading is in progress. And finally ENOMEM +may occur if the provided nvlist buffer is not large enough to +contain the entire event. + +* ZFS_IOC_EVENTS_CLEAR +Clear are events queued by the kernel. The kernel will keep a fairly +large number of recent events queued, use this ioctl to clear the +in kernel list. This will effect all user space processes consuming +events. + +The zpool command has been extended to use this events ABI with the +'events' subcommand. You may run 'zpool events -v' to output a +verbose log of all recent events. This is very similar to the +Solaris 'fmdump -ev' command with the key difference being it also +includes what would be considered sysevents under Solaris. You +may also run in follow mode with the '-f' option. To clear the +in kernel event queue use the '-c' option. 
+ +$ sudo cmd/zpool/zpool events -fv +TIME CLASS +May 13 2010 16:31:15.777711000 ereport.fs.zfs.config.sync + class = "ereport.fs.zfs.config.sync" + ena = 0x40982b7897700001 + detector = (embedded nvlist) + version = 0x0 + scheme = "zfs" + pool = 0xed976600de75dfa6 + (end detector) + + time = 0x4bec8bc3 0x2e5aed98 + pool = "zpios" + pool_guid = 0xed976600de75dfa6 + pool_context = 0x0 + +While the 'zpool events' command is handy for interactive debugging +it is not expected to be the primary consumer of zevents. This ABI +was primarily added to facilitate the addition of a user space +monitoring daemon. This daemon would consume all events posted by +the kernel and based on the type of event perform an action. For +most events simply forwarding them on to syslog is likely enough. +But this interface also cleanly allows for more sophisticated +actions to be taken such as generating an email for a failed drive. Signed-off-by: Brian Behlendorf diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3cdc269b08..8f115af2cb 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -42,8 +42,9 @@ #include #include #include - #include +#include +#include #include @@ -76,6 +77,7 @@ static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); +static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); @@ -118,6 +120,7 @@ typedef enum { HELP_SCRUB, HELP_STATUS, HELP_UPGRADE, + HELP_EVENTS, HELP_GET, HELP_SET } zpool_help_t; @@ -164,6 +167,8 @@ static zpool_command_t command_table[] = { { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, + { "events", zpool_do_events, HELP_EVENTS }, + { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, }; @@ -225,6 +230,8 @@ get_usage(zpool_help_t idx) { return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V 
version] <-a | pool ...>\n")); + case HELP_EVENTS: + return (gettext("\tevents [-vfc]\n")); case HELP_GET: return (gettext("\tget <\"all\" | property[,...]> " " ...\n")); @@ -3761,6 +3768,325 @@ zpool_do_history(int argc, char **argv) return (ret); } +typedef struct ev_opts { + int verbose; + int follow; + int clear; +} ev_opts_t; + +static void +zpool_do_events_short(nvlist_t *nvl) +{ + char ctime_str[26], str[32], *ptr; + int64_t *tv; + uint_t n; + + verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); + memset(str, ' ', 32); + (void) ctime_r(&tv[0], ctime_str); + (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ + (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ + (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ + (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]);/* '.123456789' */ + (void) printf(gettext("%s "), str); + + verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); + (void) printf(gettext("%s\n"), ptr); +} + +static void +zpool_do_events_nvprint(nvlist_t *nvl, int depth) +{ + nvpair_t *nvp; + + for (nvp = nvlist_next_nvpair(nvl, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { + + data_type_t type = nvpair_type(nvp); + const char *name = nvpair_name(nvp); + + boolean_t b; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + nvlist_t *cnv; + + printf(gettext("%*s%s = "), depth, "", name); + + switch (type) { + case DATA_TYPE_BOOLEAN: + printf(gettext("%s"), "1"); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + printf(gettext("%s"), b ? 
"1" : "0"); + break; + + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (void *)&i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (void *)&i16); + printf(gettext("0x%x"), i16); + break; + + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + printf(gettext("0x%x"), i16); + break; + + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (void *)&i32); + printf(gettext("0x%x"), i32); + break; + + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + printf(gettext("0x%x"), i32); + break; + + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (void *)&i64); + printf(gettext("0x%llx"), (u_longlong_t)i64); + break; + + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + printf(gettext("0x%llx"), (u_longlong_t)i64); + break; + + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (void *)&i64); + printf(gettext("0x%llx"), (u_longlong_t)i64); + break; + + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + printf(gettext("\"%s\""), str ? 
str : ""); + break; + + case DATA_TYPE_NVLIST: + printf(gettext("(embedded nvlist)\n")); + (void) nvpair_value_nvlist(nvp, &cnv); + zpool_do_events_nvprint(cnv, depth + 8); + printf(gettext("%*s(end %s)\n"), depth, "", name); + break; + + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + uint_t i, nelem; + + (void) nvpair_value_nvlist_array(nvp, &val, &nelem); + printf(gettext("(%d embedded nvlists)\n"), nelem); + for (i = 0; i < nelem; i++) { + printf(gettext("%*s%s[%d] = %s\n"), + depth, "", name, i, "(embedded nvlist)"); + zpool_do_events_nvprint(val[i], depth + 8); + printf(gettext("%*s(end %s[%i])\n"), + depth, "", name, i); + } + printf(gettext("%*s(end %s)\n"), depth, "", name); + } + break; + + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + uint_t i, nelem; + + (void) nvpair_value_int8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + uint_t i, nelem; + + (void) nvpair_value_int16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + uint_t i, nelem; + + (void) nvpair_value_int32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + 
uint_t i, nelem; + + (void) nvpair_value_int64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%llx "), (u_longlong_t)val[i]); + + break; + } + + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%llx "), (u_longlong_t)val[i]); + + break; + } + + case DATA_TYPE_STRING_ARRAY: + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_DOUBLE: + case DATA_TYPE_UNKNOWN: + printf(gettext("")); + break; + } + + printf(gettext("\n")); + } +} + +static int +zpool_do_events_next(ev_opts_t *opts) +{ + nvlist_t *nvl; + int ret, dropped; + + (void) printf(gettext("%-27s %s\n"), "TIME", "CLASS"); + + while (1) { + ret = zpool_events_next(g_zfs, &nvl, &dropped, !!opts->follow); + if (ret || nvl == NULL) + break; + + if (dropped > 0) + (void) printf(gettext("dropped %d events\n"), dropped); + + zpool_do_events_short(nvl); + + if (opts->verbose) { + zpool_do_events_nvprint(nvl, 8); + printf(gettext("\n")); + } + + nvlist_free(nvl); + } + + return (ret); +} + +static int +zpool_do_events_clear(ev_opts_t *opts) +{ + int count, ret; + + ret = zpool_events_clear(g_zfs, &count); + if (!ret) + (void) printf(gettext("cleared %d events\n"), count); + + return (ret); +} + +/* + * zpool events [-vfc] + * + * Displays events logs by ZFS. 
+ */ +int +zpool_do_events(int argc, char **argv) +{ + ev_opts_t opts = { 0 }; + int ret; + int c; + + /* check options */ + while ((c = getopt(argc, argv, "vfc")) != -1) { + switch (c) { + case 'v': + opts.verbose = 1; + break; + case 'f': + opts.follow = 1; + break; + case 'c': + opts.clear = 1; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (opts.clear) + ret = zpool_do_events_clear(&opts); + else + ret = zpool_do_events_next(&opts); + + return ret; +} + static int get_callback(zpool_handle_t *zhp, void *data) { diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h index f19e398f6a..43b42844d9 100644 --- a/lib/libzfs/include/libzfs.h +++ b/lib/libzfs/include/libzfs.h @@ -339,6 +339,8 @@ extern int zpool_get_history(zpool_handle_t *, nvlist_t **); extern void zpool_set_history_str(const char *subcommand, int argc, char **argv, char *history_str); extern int zpool_stage_history(libzfs_handle_t *, const char *); +extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, int); +extern int zpool_events_clear(libzfs_handle_t *, int *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index b8989a026e..09ed9d67ff 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2987,6 +2987,92 @@ zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) return (err); } +/* + * Retrieve the next event. If there is a new event available 'nvp' will + * contain a newly allocated nvlist and 'dropped' will be set to the number + * of missed events since the last call to this function. When 'nvp' is + * set to NULL it indicates no new events are available. In either case + * the function returns 0 and it is up to the caller to free 'nvp'. 
In + * the case of a fatal error the function will return a non-zero value. + * When the function is called in blocking mode it will not return until + * a new event is available. + */ +int +zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, int *dropped, int block) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", 0 }; + int error = 0; + + *nvp = NULL; + *dropped = 0; + + if (!block) + zc.zc_guid = ZEVENT_NONBLOCK; + + if (zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE) != 0) + return (-1); + +retry: + if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) { + switch (errno) { + case ESHUTDOWN: + error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, + dgettext(TEXT_DOMAIN, "zfs shutdown")); + goto out; + case ENOENT: + /* Blocking error case should not occur */ + if (block) + error = zpool_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot get event")); + + goto out; + case ENOMEM: + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + error = zfs_error_fmt(hdl, EZFS_NOMEM, + dgettext(TEXT_DOMAIN, "cannot get event")); + goto out; + } else { + goto retry; + } + default: + error = zpool_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot get event")); + goto out; + } + } + + error = zcmd_read_dst_nvlist(hdl, &zc, nvp); + if (error != 0) + goto out; + + *dropped = (int)zc.zc_cookie; +out: + zcmd_free_nvlists(&zc); + + return (error); +} + +/* + * Clear all events. 
+ */ +int +zpool_events_clear(libzfs_handle_t *hdl, int *count) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", 0 }; + char msg[1024]; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot clear events")); + + if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0) + return (zpool_standard_error_fmt(hdl, errno, msg)); + + if (count != NULL) + *count = (int)zc.zc_cookie; /* # of events cleared */ + + return (0); +} + void zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h index 7cbac416e2..3f6188da24 100644 --- a/lib/libzpool/include/sys/zfs_context.h +++ b/lib/libzpool/include/sys/zfs_context.h @@ -58,7 +58,6 @@ extern "C" { #include #include #include -#include #include #include #include @@ -72,8 +71,7 @@ extern "C" { #include #include #include -#include -#include +#include /* * Stack diff --git a/module/zcommon/include/sys/fs/zfs.h b/module/zcommon/include/sys/fs/zfs.h index 86b36a8ae9..000aef41bc 100644 --- a/module/zcommon/include/sys/fs/zfs.h +++ b/module/zcommon/include/sys/fs/zfs.h @@ -613,7 +613,9 @@ typedef enum zfs_ioc { ZFS_IOC_USERSPACE_UPGRADE, ZFS_IOC_HOLD, ZFS_IOC_RELEASE, - ZFS_IOC_GET_HOLDS + ZFS_IOC_GET_HOLDS, + ZFS_IOC_EVENTS_NEXT, + ZFS_IOC_EVENTS_CLEAR, } zfs_ioc_t; /* diff --git a/module/zfs/dsl_scrub.c b/module/zfs/dsl_scrub.c index 3d59be3f96..0f1eddb136 100644 --- a/module/zfs/dsl_scrub.c +++ b/module/zfs/dsl_scrub.c @@ -94,12 +94,12 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) if (vdev_resilver_needed(rvd, &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_RESILVER_START); + FM_EREPORT_ZFS_RESILVER_START); dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, tx->tx_txg); } else { spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_SCRUB_START); + FM_EREPORT_ZFS_SCRUB_START); } /* zero out the scrub stats in all vdev_stat_t's */ @@ 
-219,7 +219,8 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); if (*completep) spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + FM_EREPORT_ZFS_RESILVER_FINISH : + FM_EREPORT_ZFS_SCRUB_FINISH); spa_errlog_rotate(dp->dp_spa); /* diff --git a/module/zfs/fm.c b/module/zfs/fm.c index 3cc979d41b..4c6aee536e 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -54,49 +54,43 @@ #include #include -#include -#include +#include #include #include -#include #include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include -#include #include #include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int zevent_len_max = 0; +int zevent_cols = 80; +int zevent_console = 0; + +static int zevent_len_cur = 0; +static int zevent_waiters = 0; +static int zevent_flags = 0; + +static kmutex_t zevent_lock; +static list_t zevent_list; +static kcondvar_t zevent_cv; +#endif /* _KERNEL */ /* - * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These - * values must be kept in sync with the FMA source code in usr/src/cmd/fm. 
- */ -static const char *fm_url = "http://www.sun.com/msg"; -static const char *fm_msgid = "SUNOS-8000-0G"; -static char *volatile fm_panicstr = NULL; - -errorq_t *ereport_errorq; -void *ereport_dumpbuf; -size_t ereport_dumplen; - -static uint_t ereport_chanlen = ERPT_EVCH_MAX; -static evchan_t *ereport_chan = NULL; -static ulong_t ereport_qlen = 0; -static size_t ereport_size = 0; -static int ereport_cols = 80; - -/* - * Common fault management kstats to record ereport generation - * failures + * Common fault management kstats to record event generation failures */ struct erpt_kstat { @@ -113,57 +107,9 @@ static struct erpt_kstat erpt_kstat_data = { { "payload-set-failed", KSTAT_DATA_UINT64 } }; -/*ARGSUSED*/ -static void -fm_drain(void *private, void *data, errorq_elem_t *eep) -{ - nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep); +kstat_t *fm_ksp; - if (!panicstr) - (void) fm_ereport_post(nvl, EVCH_TRYHARD); - else - fm_nvprint(nvl); -} - -void -fm_init(void) -{ - kstat_t *ksp; - - (void) sysevent_evc_bind(FM_ERROR_CHAN, - &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND); - - (void) sysevent_evc_control(ereport_chan, - EVCH_SET_CHAN_LEN, &ereport_chanlen); - - if (ereport_qlen == 0) - ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4); - - if (ereport_size == 0) - ereport_size = ERPT_DATA_SZ; - - ereport_errorq = errorq_nvcreate("fm_ereport_queue", - (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size, - FM_ERR_PIL, ERRORQ_VITAL); - if (ereport_errorq == NULL) - panic("failed to create required ereport error queue"); - - ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP); - ereport_dumplen = ereport_size; - - /* Initialize ereport allocation and generation kstats */ - ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED, - sizeof (struct erpt_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (ksp != NULL) { - ksp->ks_data = &erpt_kstat_data; - kstat_install(ksp); - } else { - cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); - - } 
-} +#ifdef _KERNEL /* * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of @@ -182,7 +128,7 @@ fm_printf(int depth, int c, int cols, const char *format, ...) va_end(ap); if (c + width >= cols) { - console_printf("\n\r"); + console_printf("\n"); c = 0; if (format[0] != ' ' && depth > 0) { console_printf(" "); @@ -244,54 +190,54 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); - c = fm_printf(d + 1, c, cols, "%x", i8); + c = fm_printf(d + 1, c, cols, "0x%x", i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); - c = fm_printf(d + 1, c, cols, "%x", i8); + c = fm_printf(d + 1, c, cols, "0x%x", i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); - c = fm_printf(d + 1, c, cols, "%x", i8); + c = fm_printf(d + 1, c, cols, "0x%x", i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); - c = fm_printf(d + 1, c, cols, "%x", i16); + c = fm_printf(d + 1, c, cols, "0x%x", i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); - c = fm_printf(d + 1, c, cols, "%x", i16); + c = fm_printf(d + 1, c, cols, "0x%x", i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); - c = fm_printf(d + 1, c, cols, "%x", i32); + c = fm_printf(d + 1, c, cols, "0x%x", i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); - c = fm_printf(d + 1, c, cols, "%x", i32); + c = fm_printf(d + 1, c, cols, "0x%x", i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "%llx", + c = fm_printf(d + 1, c, cols, "0x%llx", (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); - c = fm_printf(d + 1, c, cols, "%llx", + c = fm_printf(d + 1, c, cols, "0x%llx", (u_longlong_t)i64); break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "%llx", + c = fm_printf(d + 
1, c, cols, "0x%llx", (u_longlong_t)i64); break; @@ -321,19 +267,124 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) } break; + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, 
cols, "]"); + break; + } + + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: c = fm_printf(d + 1, c, cols, "[...]"); break; + case DATA_TYPE_UNKNOWN: c = fm_printf(d + 1, c, cols, ""); break; @@ -349,175 +400,211 @@ fm_nvprint(nvlist_t *nvl) char *class; int c = 0; - console_printf("\r"); + console_printf("\n"); if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0) - c = fm_printf(0, c, ereport_cols, "%s", class); + c = fm_printf(0, c, zevent_cols, "%s", class); - if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0) + if (fm_nvprintr(nvl, 0, c, zevent_cols) != 0) console_printf("\n"); console_printf("\n"); } -/* - * Wrapper for panic() that first produces an FMA-style message for admins. - * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this - * is the one exception to that rule and the only error that gets messaged. - * This function is intended for use by subsystems that have detected a fatal - * error and enqueued appropriate ereports and wish to then force a panic. 
- */ -/*PRINTFLIKE1*/ -void -fm_panic(const char *format, ...) +static zevent_t * +fm_event_alloc(void) { - va_list ap; + zevent_t *ev; - (void) casptr((void *)&fm_panicstr, NULL, (void *)format); - va_start(ap, format); - vpanic(format, ap); - va_end(ap); + ev = kmem_zalloc(sizeof(zevent_t), KM_SLEEP); + if (ev == NULL) + return NULL; + + list_create(&ev->ev_zpd_list, sizeof(zfs_private_data_t), + offsetof(zfs_private_data_t, zpd_node)); + list_link_init(&ev->ev_node); + + return ev; +} + +static void +fm_event_free(zevent_t *ev) +{ + /* Run provided cleanup callback */ + ev->ev_cb(ev->ev_nvl); + + list_destroy(&ev->ev_zpd_list); + kmem_free(ev, sizeof(zevent_t)); +} + +static void +fm_zevent_drain(zevent_t *ev) +{ + zfs_private_data_t *zpd; + + ASSERT(MUTEX_HELD(&zevent_lock)); + list_remove(&zevent_list, ev); + + /* Remove references to this event in all private file data */ + while ((zpd = list_head(&ev->ev_zpd_list)) != NULL) { + list_remove(&ev->ev_zpd_list, zpd); + zpd->zpd_zevent = NULL; + zpd->zpd_dropped++; + } + + fm_event_free(ev); +} + +void +fm_zevent_drain_all(int *count) +{ + zevent_t *ev; + + mutex_enter(&zevent_lock); + while ((ev = list_head(&zevent_list)) != NULL) + fm_zevent_drain(ev); + + *count = zevent_len_cur; + zevent_len_cur = 0; + mutex_exit(&zevent_lock); } /* - * Print any appropriate FMA banner message before the panic message. This - * function is called by panicsys() and prints the message for fm_panic(). - * We print the message here so that it comes after the system is quiesced. - * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix). - * The rest of the message is for the console only and not needed in the log, - * so it is printed using console_printf(). We break it up into multiple - * chunks so as to avoid overflowing any small legacy prom_printf() buffers. + * New zevents are inserted at the head. If the maximum queue + * length is exceeded a zevent will be drained from the tail. 
+ * As part of this any user space processes which currently have + * a reference to this zevent_t in their private data will have + * this reference set to NULL. */ -void -fm_banner(void) +static void +fm_zevent_insert(zevent_t *ev) { - timespec_t tod; - hrtime_t now; + mutex_enter(&zevent_lock); + list_insert_head(&zevent_list, ev); + if (zevent_len_cur >= zevent_len_max) + fm_zevent_drain(list_tail(&zevent_list)); + else + zevent_len_cur++; - if (!fm_panicstr) - return; /* panic was not initiated by fm_panic(); do nothing */ - - if (panicstr) { - tod = panic_hrestime; - now = panic_hrtime; - } else { - gethrestime(&tod); - now = gethrtime_waitfree(); - } - - cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, " - "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid); - - console_printf( -"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n" -"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n", - fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now); - - console_printf( -"PLATFORM: %s, CSN: -, HOSTNAME: %s\n" -"SOURCE: %s, REV: %s %s\n", - platform, utsname.nodename, utsname.sysname, - utsname.release, utsname.version); - - console_printf( -"DESC: Errors have been detected that require a reboot to ensure system\n" -"integrity. See %s/%s for more information.\n", - fm_url, fm_msgid); - - console_printf( -"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n" -"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n" -"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n"); - - console_printf("\n"); + mutex_exit(&zevent_lock); } /* - * Utility function to write all of the pending ereports to the dump device. - * This function is called at either normal reboot or panic time, and simply - * iterates over the in-transit messages in the ereport sysevent channel. 
+ * Post a zevent */ void -fm_ereport_dump(void) -{ - evchanq_t *chq; - sysevent_t *sep; - erpt_dump_t ed; - - timespec_t tod; - hrtime_t now; - char *buf; - size_t len; - - if (panicstr) { - tod = panic_hrestime; - now = panic_hrtime; - } else { - if (ereport_errorq != NULL) - errorq_drain(ereport_errorq); - gethrestime(&tod); - now = gethrtime_waitfree(); - } - - /* - * In the panic case, sysevent_evc_walk_init() will return NULL. - */ - if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL && - !panicstr) - return; /* event channel isn't initialized yet */ - - while ((sep = sysevent_evc_walk_step(chq)) != NULL) { - if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL) - break; - - ed.ed_magic = ERPT_MAGIC; - ed.ed_chksum = checksum32(buf, len); - ed.ed_size = (uint32_t)len; - ed.ed_pad = 0; - ed.ed_hrt_nsec = SE_TIME(sep); - ed.ed_hrt_base = now; - ed.ed_tod_base.sec = tod.tv_sec; - ed.ed_tod_base.nsec = tod.tv_nsec; - - dumpvp_write(&ed, sizeof (ed)); - dumpvp_write(buf, len); - } - - sysevent_evc_walk_fini(chq); -} - -/* - * Post an error report (ereport) to the sysevent error channel. The error - * channel must be established with a prior call to sysevent_evc_create() - * before publication may occur. 
- */ -void -fm_ereport_post(nvlist_t *ereport, int evc_flag) +fm_zevent_post(nvlist_t *nvl, zevent_cb_t *cb) { size_t nvl_size = 0; - evchan_t *error_chan; + zevent_t *ev; - (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE); + (void) nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1); return; } - if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan, - EVCH_CREAT|EVCH_HOLD_PEND) != 0) { + if (zevent_console) + fm_nvprint(nvl); + + ev = fm_event_alloc(); + if (ev == NULL) { atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1); return; } - if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR, - SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) { - atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1); - sysevent_evc_unbind(error_chan); - return; - } - sysevent_evc_unbind(error_chan); + ev->ev_nvl = nvl; + ev->ev_cb = cb; + fm_zevent_insert(ev); + cv_broadcast(&zevent_cv); } +/* + * Get the next zevent in the stream. To avoid making an extra copy of the + * nvlist we must call put_nvlist() here safely under the zevent_lock. 
+ */ +int +fm_zevent_next(zfs_private_data_t *zpd, zfs_cmd_t *zc) +{ + zevent_t *ev; + int error; + + mutex_enter(&zevent_lock); + if (zpd->zpd_zevent == NULL) { + /* New stream start at the beginning/tail */ + ev = list_tail(&zevent_list); + if (ev == NULL) { + error = ENOENT; + goto out; + } + } else { + /* Existing stream continue with the next element and remove + * ourselves from the wait queue for the previous element */ + ev = list_prev(&zevent_list, zpd->zpd_zevent); + if (ev == NULL) { + error = ENOENT; + goto out; + } + + list_remove(&zpd->zpd_zevent->ev_zpd_list, zpd); + } + + zpd->zpd_zevent = ev; + list_insert_head(&ev->ev_zpd_list, zpd); + error = put_nvlist(zc, ev->ev_nvl); + zc->zc_cookie = zpd->zpd_dropped; + zpd->zpd_dropped = 0; +out: + mutex_exit(&zevent_lock); + + return error; +} + +int +fm_zevent_wait(zfs_private_data_t *zpd) +{ + int error = 0; + + mutex_enter(&zevent_lock); + + if (zevent_flags & ZEVENT_SHUTDOWN) { + error = ESHUTDOWN; + goto out; + } + + zevent_waiters++; + cv_wait_interruptible(&zevent_cv, &zevent_lock); + if (issig(JUSTLOOKING)) + error = EINTR; + + zevent_waiters--; +out: + mutex_exit(&zevent_lock); + + return error; +} + +void +fm_zevent_init(zfs_private_data_t *zpd) +{ + list_link_init(&zpd->zpd_node); + zpd->zpd_zevent = NULL; + zpd->zpd_dropped = 0; +} + +void +fm_zevent_fini(zfs_private_data_t *zpd) +{ + mutex_enter(&zevent_lock); + if (zpd->zpd_zevent) + list_remove(&zpd->zpd_zevent->ev_zpd_list, zpd); + + zpd->zpd_zevent = NULL; + zpd->zpd_dropped = 0; + mutex_exit(&zevent_lock); +} +#endif /* _KERNEL */ + /* * Wrapppers for FM nvlist allocators */ @@ -795,6 +882,8 @@ fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, { char ereport_class[FM_MAX_CLASS]; const char *name; + timestruc_t tv; + int64_t tv_array[2]; va_list ap; int ret; @@ -826,6 +915,13 @@ fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, if (ret) atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); 
+ + gethrestime(&tv); + tv_array[0] = tv.tv_sec; + tv_array[1] = tv.tv_nsec; + if (nvlist_add_int64_array(ereport, FM_EREPORT_TIME, tv_array, 2)) { + atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); + } } /* @@ -1146,7 +1242,7 @@ fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) ena = (uint64_t)((format & ENA_FORMAT_MASK) | ((cpuid << ENA_FMT1_CPUID_SHFT) & ENA_FMT1_CPUID_MASK) | - ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) & + ((gethrtime() << ENA_FMT1_TIME_SHFT) & ENA_FMT1_TIME_MASK)); } break; @@ -1164,7 +1260,7 @@ fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) uint64_t fm_ena_generate(uint64_t timestamp, uchar_t format) { - return (fm_ena_generate_cpu(timestamp, CPU->cpu_id, format)); + return (fm_ena_generate_cpu(timestamp, getcpuid(), format)); } uint64_t @@ -1232,35 +1328,67 @@ fm_ena_time_get(uint64_t ena) return (time); } -/* - * Convert a getpcstack() trace to symbolic name+offset, and add the resulting - * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK. 
- */ +#ifdef _KERNEL void -fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth) +fm_init(void) { - int i; - char *sym; - ulong_t off; - char *stkpp[FM_STK_DEPTH]; - char buf[FM_STK_DEPTH * FM_SYM_SZ]; - char *stkp = buf; + zevent_len_cur = 0; + zevent_flags = 0; - for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) { - if ((sym = kobj_getsymname(stack[i], &off)) != NULL) - (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off); - else - (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]); - stkpp[i] = stkp; + if (zevent_len_max == 0) + zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4); + + /* Initialize zevent allocation and generation kstats */ + fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED, + sizeof (struct erpt_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (fm_ksp != NULL) { + fm_ksp->ks_data = &erpt_kstat_data; + kstat_install(fm_ksp); + } else { + cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); } - fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK, - DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL); + mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zevent_list, sizeof(zevent_t), offsetof(zevent_t, ev_node)); + cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); } void -print_msg_hwerr(ctid_t ct_id, proc_t *p) +fm_fini(void) { - uprintf("Killed process %d (%s) in contract id %d " - "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id); + int count; + + fm_zevent_drain_all(&count); + cv_broadcast(&zevent_cv); + + mutex_enter(&zevent_lock); + zevent_flags |= ZEVENT_SHUTDOWN; + while (zevent_waiters > 0) { + mutex_exit(&zevent_lock); + schedule(); + mutex_enter(&zevent_lock); + } + mutex_exit(&zevent_lock); + + cv_destroy(&zevent_cv); + list_destroy(&zevent_list); + mutex_destroy(&zevent_lock); + + if (fm_ksp != NULL) { + kstat_delete(fm_ksp); + fm_ksp = NULL; + } } + +module_param(zevent_len_max, int, 0644); +MODULE_PARM_DESC(zevent_len_max, "Maximum event queue 
length"); + +module_param(zevent_cols, int, 0644); +MODULE_PARM_DESC(zevent_cols, "Maximum event column width"); + +module_param(zevent_console, int, 0644); +MODULE_PARM_DESC(zevent_console, "Log events to the console"); + +#endif /* _KERNEL */ diff --git a/module/zfs/include/sys/fm/fs/zfs.h b/module/zfs/include/sys/fm/fs/zfs.h index 21b7dbe52c..5bce5346e7 100644 --- a/module/zfs/include/sys/fm/fs/zfs.h +++ b/module/zfs/include/sys/fm/fs/zfs.h @@ -35,7 +35,9 @@ extern "C" { #define FM_EREPORT_ZFS_CHECKSUM "checksum" #define FM_EREPORT_ZFS_IO "io" #define FM_EREPORT_ZFS_DATA "data" +#define FM_EREPORT_ZFS_CONFIG_SYNC "config.sync" #define FM_EREPORT_ZFS_POOL "zpool" +#define FM_EREPORT_ZFS_POOL_DESTROY "zpool.destroy" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" #define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" @@ -43,9 +45,18 @@ extern "C" { #define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" #define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" #define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" +#define FM_EREPORT_ZFS_DEVICE_REMOVE "vdev.remove" +#define FM_EREPORT_ZFS_DEVICE_CLEAR "vdev.clear" +#define FM_EREPORT_ZFS_DEVICE_CHECK "vdev.check" +#define FM_EREPORT_ZFS_DEVICE_SPARE "vdev.spare" +#define FM_EREPORT_ZFS_DEVICE_AUTOEXPAND "vdev.autoexpand" #define FM_EREPORT_ZFS_IO_FAILURE "io_failure" #define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" #define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" +#define FM_EREPORT_ZFS_RESILVER_START "resilver.start" +#define FM_EREPORT_ZFS_RESILVER_FINISH "resilver.finish" +#define FM_EREPORT_ZFS_SCRUB_START "scrub.start" +#define FM_EREPORT_ZFS_SCRUB_FINISH "scrub.finish" #define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" #define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" @@ -73,8 +84,8 @@ extern "C" { #define FM_EREPORT_FAILMODE_CONTINUE "continue" #define FM_EREPORT_FAILMODE_PANIC "panic" -#define FM_RESOURCE_REMOVED 
"removed" -#define FM_RESOURCE_AUTOREPLACE "autoreplace" +#define FM_EREPORT_RESOURCE_REMOVED "removed" +#define FM_EREPORT_RESOURCE_AUTOREPLACE "autoreplace" #ifdef __cplusplus } diff --git a/module/zfs/include/sys/fm/protocol.h b/module/zfs/include/sys/fm/protocol.h index 767fb07d81..70c3c93fac 100644 --- a/module/zfs/include/sys/fm/protocol.h +++ b/module/zfs/include/sys/fm/protocol.h @@ -68,6 +68,7 @@ extern "C" { /* ereport payload member names */ #define FM_EREPORT_DETECTOR "detector" #define FM_EREPORT_ENA "ena" +#define FM_EREPORT_TIME "time" /* list.* event payload member names */ #define FM_LIST_EVENT_SIZE "list-sz" @@ -295,15 +296,13 @@ extern "C" { #define FM_FMRI_ZFS_POOL "pool" #define FM_FMRI_ZFS_VDEV "vdev" -extern nv_alloc_t *fm_nva_xcreate(char *, size_t); -extern void fm_nva_xdestroy(nv_alloc_t *); - -extern nvlist_t *fm_nvlist_create(nv_alloc_t *); -extern void fm_nvlist_destroy(nvlist_t *, int); - #define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */ #define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */ +extern nv_alloc_t *fm_nva_xcreate(char *, size_t); +extern void fm_nva_xdestroy(nv_alloc_t *); +extern nvlist_t *fm_nvlist_create(nv_alloc_t *); +extern void fm_nvlist_destroy(nvlist_t *, int); extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t, const nvlist_t *, ...); extern void fm_payload_set(nvlist_t *, ...); @@ -312,15 +311,11 @@ extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *, int, ...); extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *, const char *); -extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *); extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t, uint8_t *, const char *); extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, const char *, uint64_t); -extern void fm_authority_set(nvlist_t *, int, const char *, const char *, - const char *, const char *); extern void 
fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); - extern uint64_t fm_ena_increment(uint64_t); extern uint64_t fm_ena_generate(uint64_t, uchar_t); extern uint64_t fm_ena_generate_cpu(uint64_t, processorid_t, uchar_t); diff --git a/module/zfs/include/sys/fm/util.h b/module/zfs/include/sys/fm/util.h index 4e19e4de09..2052e1998e 100644 --- a/module/zfs/include/sys/fm/util.h +++ b/module/zfs/include/sys/fm/util.h @@ -34,7 +34,6 @@ extern "C" { #endif #include -#include /* * Shared user/kernel definitions for class length, error channel name, @@ -74,27 +73,41 @@ typedef struct erpt_dump { } erpt_dump_t; #ifdef _KERNEL + #include +#include -#define FM_STK_DEPTH 20 /* maximum stack depth */ -#define FM_SYM_SZ 64 /* maximum symbol size */ -#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */ +#define ZEVENT_SHUTDOWN 0x1 -#define FM_EREPORT_PAYLOAD_NAME_STACK "stack" +typedef void zevent_cb_t(nvlist_t *); -extern errorq_t *ereport_errorq; -extern void *ereport_dumpbuf; -extern size_t ereport_dumplen; +typedef struct zevent_s { + nvlist_t *ev_nvl; /* protected by the zevent_lock */ + list_t ev_zpd_list; /* " */ + list_node_t ev_node; /* " */ + zevent_cb_t *ev_cb; /* " */ +} zevent_t; + +typedef struct zfs_private_data { + zevent_t *zpd_zevent; /* protected by the zevent_lock */ + list_node_t zpd_node; /* " */ + uint64_t zpd_dropped; /* " */ +} zfs_private_data_t; extern void fm_init(void); +extern void fm_fini(void); extern void fm_nvprint(nvlist_t *); -extern void fm_panic(const char *, ...); -extern void fm_banner(void); +extern void fm_zevent_init(zfs_private_data_t *); +extern void fm_zevent_fini(zfs_private_data_t *); +extern void fm_zevent_post(nvlist_t *, zevent_cb_t *); +extern void fm_zevent_drain_all(int *); +extern int fm_zevent_next(zfs_private_data_t *, zfs_cmd_t *); +extern int fm_zevent_wait(zfs_private_data_t *); -extern void fm_ereport_dump(void); -extern void fm_ereport_post(nvlist_t *, int); +#else -extern void 
fm_payload_stack_add(nvlist_t *, const pc_t *, int); +static inline void fm_init(void) { } +static inline void fm_fini(void) { } #endif /* _KERNEL */ diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index 30554ae0ec..68af574b55 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -510,8 +510,6 @@ struct zio; extern void spa_log_error(spa_t *spa, struct zio *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t stateoroffset, uint64_t length); -extern void zfs_post_remove(spa_t *spa, vdev_t *vd); -extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); diff --git a/module/zfs/include/sys/zfs_context.h b/module/zfs/include/sys/zfs_context.h index 40de32084d..6f8fa3c6a2 100644 --- a/module/zfs/include/sys/zfs_context.h +++ b/module/zfs/include/sys/zfs_context.h @@ -58,10 +58,7 @@ extern "C" { #include #include #include -#include -#include -#include -#include +#include #define CPU_SEQID (CPU->cpu_seqid) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d147b8e910..ba6f6d45cc 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1098,8 +1098,9 @@ spa_check_removed(vdev_t *vd) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { - zfs_post_autoreplace(vd->vdev_spa, vd); - spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE, + vd->vdev_spa, vd, NULL, 0, 0); + spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_CHECK); } } @@ -2848,7 +2849,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, } } - spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_DESTROY); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); @@ -3158,7 +3159,7 
@@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if (newvd->vdev_isspare) { spa_spare_activate(newvd); - spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); + spa_event_notify(spa, newvd, FM_EREPORT_ZFS_DEVICE_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); @@ -3376,7 +3377,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_REMOVE); error = spa_vdev_exit(spa, vd, txg, 0); @@ -3718,9 +3719,6 @@ spa_async_probe(spa_t *spa, vdev_t *vd) static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { - sysevent_id_t eid; - nvlist_t *attr; - char *physpath; int c; if (!spa->spa_autoexpand) @@ -3734,17 +3732,7 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd) if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; - physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); - - VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); - - (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, - ESC_DEV_DLE, attr, &eid, DDI_SLEEP); - - nvlist_free(attr); - kmem_free(physpath, MAXPATHLEN); + spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_AUTOEXPAND); } static void @@ -4508,8 +4496,7 @@ spa_has_active_shared_spare(spa_t *spa) } /* - * Post a sysevent corresponding to the given event. The 'name' must be one of - * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * Post a FM_EREPORT_ZFS_* event from sys/fm/fs/zfs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. 
@@ -4518,49 +4505,6 @@ void spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL - sysevent_t *ev; - sysevent_attr_list_t *attr = NULL; - sysevent_value_t value; - sysevent_id_t eid; - - ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", - SE_SLEEP); - - value.value_type = SE_DATA_TYPE_STRING; - value.value.sv_string = spa_name(spa); - if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) - goto done; - - value.value_type = SE_DATA_TYPE_UINT64; - value.value.sv_uint64 = spa_guid(spa); - if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) - goto done; - - if (vd) { - value.value_type = SE_DATA_TYPE_UINT64; - value.value.sv_uint64 = vd->vdev_guid; - if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, - SE_SLEEP) != 0) - goto done; - - if (vd->vdev_path) { - value.value_type = SE_DATA_TYPE_STRING; - value.value.sv_string = vd->vdev_path; - if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, - &value, SE_SLEEP) != 0) - goto done; - } - } - - if (sysevent_attach_attributes(ev, attr) != 0) - goto done; - attr = NULL; - - (void) log_sysevent(ev, SE_SLEEP, &eid); - -done: - if (attr) - sysevent_free_attr(attr); - sysevent_free(ev); + zfs_ereport_post(name, spa, vd, NULL, 0, 0); #endif } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index b2063bba13..147aebae20 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -267,7 +267,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) spa_config_generation++; if (postsysevent) - spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC); + spa_event_notify(target, NULL, FM_EREPORT_ZFS_CONFIG_SYNC); } /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ef74a443de..f8a2e6102f 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1374,6 +1375,7 @@ spa_init(int mode) spa_mode_global = mode; + fm_init(); 
refcount_init(); unique_init(); zio_init(); @@ -1399,6 +1401,7 @@ spa_fini(void) zio_fini(); unique_fini(); refcount_fini(); + fm_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 57869b6e6e..f51e955f39 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2097,7 +2097,7 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_CLEAR); } } @@ -2634,7 +2634,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * Indicate to the ZFS DE that this device has been removed, and * any recent errors should be ignored. */ - zfs_post_remove(spa, vd); + zfs_ereport_post(FM_EREPORT_RESOURCE_REMOVED, + spa, vd, NULL, 0, 0); vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 8b7785fa83..f2110feabc 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -88,6 +88,14 @@ * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). */ +#ifdef _KERNEL +static void +zfs_ereport_post_cb(nvlist_t *nvl) +{ + fm_nvlist_destroy(nvl, FM_NVA_FREE); +} +#endif /* _KERNEL */ + void zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t size) @@ -205,6 +213,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, vd != NULL ? 
vd->vdev_guid : 0); fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); + fm_nvlist_destroy(detector, FM_NVA_FREE); /* * Construct the per-ereport payload, depending on which parameters are @@ -324,58 +333,11 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, } mutex_exit(&spa->spa_errlist_lock); - fm_ereport_post(ereport, EVCH_SLEEP); - - fm_nvlist_destroy(ereport, FM_NVA_FREE); - fm_nvlist_destroy(detector, FM_NVA_FREE); -#endif + /* Cleanup must be handled by the passed callback function */ + fm_zevent_post(ereport, zfs_ereport_post_cb); +#endif /* _KERNEL */ } -static void -zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) -{ -#ifdef _KERNEL - nvlist_t *resource; - char class[64]; - - if ((resource = fm_nvlist_create(NULL)) == NULL) - return; - - (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, name); - VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); - VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); - VERIFY(nvlist_add_uint64(resource, - FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); - if (vd) - VERIFY(nvlist_add_uint64(resource, - FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); - - fm_ereport_post(resource, EVCH_SLEEP); - - fm_nvlist_destroy(resource, FM_NVA_FREE); -#endif -} - -/* - * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev - * has been removed from the system. This will cause the DE to ignore any - * recent I/O errors, inferring that they are due to the asynchronous device - * removal. - */ -void -zfs_post_remove(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); -} - -/* - * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool - * has the 'autoreplace' property set, and therefore any broken vdevs will be - * handled by higher level logic, and no vdev fault should be generated. 
- */ -void -zfs_post_autoreplace(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); -} +#if defined(_KERNEL) && defined(HAVE_SPL) +EXPORT_SYMBOL(zfs_ereport_post); +#endif /* _KERNEL */ From e42734b565f67a477258c1a1daf7a25f96537547 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 14 May 2010 12:51:37 -0700 Subject: [PATCH 3/3] New TopGit dependency: linux-events --- .topdeps | 1 + 1 file changed, 1 insertion(+) diff --git a/.topdeps b/.topdeps index 8d375b965f..4d67065aef 100644 --- a/.topdeps +++ b/.topdeps @@ -12,3 +12,4 @@ linux-unused linux-ztest linux-have-idmap linux-kernel-device +linux-events