More fixes to ensure we get good debug logs even if we're in the

process of destroying the stacks.  Threshold set fairly aggressively
to 80% of stack usage.



git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@82 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c
This commit is contained in:
behlendo 2008-04-21 22:44:11 +00:00
parent e5bbd245e3
commit 7fea96c04f
3 changed files with 63 additions and 53 deletions

View File

@ -64,6 +64,16 @@ extern unsigned int spl_debug_stack;
#define SPL_DEFAULT_MIN_DELAY ((HZ + 1) / 2) #define SPL_DEFAULT_MIN_DELAY ((HZ + 1) / 2)
#define SPL_DEFAULT_BACKOFF 2 #define SPL_DEFAULT_BACKOFF 2
#define DL_NOTHREAD 0x0001 /* Do not create a new thread */
#define DL_SINGLE_CPU 0x0002 /* Collect pages from this CPU */
typedef struct dumplog_priv {
wait_queue_head_t dp_waitq;
pid_t dp_pid;
int dp_flags;
atomic_t dp_done;
} dumplog_priv_t;
typedef struct { typedef struct {
unsigned long cdls_next; unsigned long cdls_next;
int cdls_count; int cdls_count;
@ -147,7 +157,7 @@ struct page_collection {
int pc_want_daemon_pages; int pc_want_daemon_pages;
}; };
#define SBUG() spl_debug_bug(__FILE__, __FUNCTION__, __LINE__); #define SBUG() spl_debug_bug(__FILE__, __FUNCTION__, __LINE__, 0);
#ifdef __ia64__ #ifdef __ia64__
#define CDEBUG_STACK() (THREAD_SIZE - \ #define CDEBUG_STACK() (THREAD_SIZE - \
@ -159,29 +169,24 @@ struct page_collection {
(THREAD_SIZE - 1))) (THREAD_SIZE - 1)))
# endif /* __ia64__ */ # endif /* __ia64__ */
/* DL_NOTHREAD and DL_SINGLE_CPU flags are passed to spl_debug_bug()
* because we have overrun our stack and likely damaged at least one
* other unknown thread's stack. We must finish generating the needed
* debug info within this thread context because once we yield the CPU
* it's very likely the system will crash.
*/
#define __CHECK_STACK(file, func, line) \ #define __CHECK_STACK(file, func, line) \
do { \ do { \
unsigned long _stack = CDEBUG_STACK(); \ unsigned long _stack = CDEBUG_STACK(); \
unsigned long _soft_limit = (9 * THREAD_SIZE) / 10; \ unsigned long _soft_limit = (8 * THREAD_SIZE) / 10; \
\ \
if (unlikely(_stack > _soft_limit && _stack > spl_debug_stack)){\ if (unlikely(_stack > _soft_limit && _stack > spl_debug_stack)){\
spl_debug_stack = _stack; \ spl_debug_stack = _stack; \
if (_stack <= THREAD_SIZE) { \
spl_debug_msg(NULL, D_TRACE, D_WARNING, \ spl_debug_msg(NULL, D_TRACE, D_WARNING, \
file, func, line, "Warning " \ file, func, line, "Error exceeded " \
"exceeded 90%% of maximum safe " \ "maximum safe stack size (%lu/%lu)\n", \
"stack size (%lu/%lu)\n", \
_stack, THREAD_SIZE); \ _stack, THREAD_SIZE); \
spl_debug_dumpstack(NULL); \ spl_debug_bug(file, func, line, DL_SINGLE_CPU); \
spl_debug_dumplog(); \
} else { \
spl_debug_msg(NULL, D_TRACE, D_WARNING, \
file, func, line, "Error " \
"exceeded maximum safe stack " \
"size (%lu/%lu)\n", \
_stack, THREAD_SIZE); \
SBUG(); \
} \
} \ } \
} while (0) } while (0)
@ -213,7 +218,7 @@ do { \
spl_debug_msg(NULL, DEBUG_SUBSYSTEM, D_EMERG, \ spl_debug_msg(NULL, DEBUG_SUBSYSTEM, D_EMERG, \
__FILE__, __FUNCTION__, __LINE__, \ __FILE__, __FUNCTION__, __LINE__, \
"ASSERTION(" #cond ") failed\n"); \ "ASSERTION(" #cond ") failed\n"); \
spl_debug_bug(__FILE__, __FUNCTION__, __LINE__); \ SBUG(); \
} \ } \
} while (0) } while (0)
@ -226,7 +231,7 @@ do { \
__FILE__, __FUNCTION__, __LINE__, \ __FILE__, __FUNCTION__, __LINE__, \
"ASSERTION(" #cond ") failed:" fmt, \ "ASSERTION(" #cond ") failed:" fmt, \
## a); \ ## a); \
spl_debug_bug(__FILE__, __FUNCTION__, __LINE__) \ SBUG(); \
} \ } \
} while (0) } while (0)
@ -242,7 +247,7 @@ do { \
__FILE__, __FUNCTION__, __LINE__, \ __FILE__, __FUNCTION__, __LINE__, \
"VERIFY3(" FMT " " #OP " " FMT ")\n", \ "VERIFY3(" FMT " " #OP " " FMT ")\n", \
CAST __left, CAST __right); \ CAST __left, CAST __right); \
spl_debug_bug(__FILE__, __FUNCTION__, __LINE__); \ SBUG(); \
} \ } \
} while (0) } while (0)
@ -285,7 +290,6 @@ do { \
#define CDEBUG_LIMIT(mask, format, a...) \ #define CDEBUG_LIMIT(mask, format, a...) \
__CDEBUG_LIMIT(DEBUG_SUBSYSTEM, mask, format, ## a) __CDEBUG_LIMIT(DEBUG_SUBSYSTEM, mask, format, ## a)
#define dprintf(fmt, a...) CDEBUG_LIMIT(D_INFO, fmt, ## a)
#define CWARN(fmt, a...) CDEBUG_LIMIT(D_WARNING, fmt, ## a) #define CWARN(fmt, a...) CDEBUG_LIMIT(D_WARNING, fmt, ## a)
#define CERROR(fmt, a...) CDEBUG_LIMIT(D_ERROR, fmt, ## a) #define CERROR(fmt, a...) CDEBUG_LIMIT(D_ERROR, fmt, ## a)
#define CEMERG(fmt, a...) CDEBUG_LIMIT(D_EMERG, fmt, ## a) #define CEMERG(fmt, a...) CDEBUG_LIMIT(D_EMERG, fmt, ## a)
@ -329,9 +333,9 @@ extern unsigned long spl_debug_get_subsys(void);
extern int spl_debug_set_mb(int mb); extern int spl_debug_set_mb(int mb);
extern int spl_debug_get_mb(void); extern int spl_debug_get_mb(void);
extern int spl_debug_dumplog(void); extern int spl_debug_dumplog(int flags);
extern void spl_debug_dumpstack(struct task_struct *tsk); extern void spl_debug_dumpstack(struct task_struct *tsk);
extern void spl_debug_bug(char *file, const char *func, const int line); extern void spl_debug_bug(char *file, const char *func, const int line, int flags);
extern int spl_debug_clear_buffer(void); extern int spl_debug_clear_buffer(void);
extern int spl_debug_mark_buffer(char *text); extern int spl_debug_mark_buffer(char *text);

View File

@ -102,7 +102,7 @@ struct rw_semaphore trace_sem;
atomic_t trace_tage_allocated = ATOMIC_INIT(0); atomic_t trace_tage_allocated = ATOMIC_INIT(0);
static int panic_notifier(struct notifier_block *, unsigned long, void *); static int panic_notifier(struct notifier_block *, unsigned long, void *);
static int spl_debug_dump_all_pages(char *); static int spl_debug_dump_all_pages(dumplog_priv_t *dp, char *);
static void trace_fini(void); static void trace_fini(void);
@ -344,12 +344,6 @@ spl_debug_str2mask(unsigned long *mask, const char *str, int is_subsys)
return 0; return 0;
} }
typedef struct dumplog_priv {
wait_queue_head_t dp_waitq;
pid_t dp_pid;
atomic_t dp_flag;
} dumplog_priv_t;
static void static void
spl_debug_dumplog_internal(dumplog_priv_t *dp) spl_debug_dumplog_internal(dumplog_priv_t *dp)
{ {
@ -362,7 +356,7 @@ spl_debug_dumplog_internal(dumplog_priv_t *dp)
"%s.%ld.%ld", spl_debug_file_path, "%s.%ld.%ld", spl_debug_file_path,
get_seconds(), (long)dp->dp_pid); get_seconds(), (long)dp->dp_pid);
printk(KERN_ALERT "SPL: dumping log to %s\n", spl_debug_file_name); printk(KERN_ALERT "SPL: dumping log to %s\n", spl_debug_file_name);
spl_debug_dump_all_pages(spl_debug_file_name); spl_debug_dump_all_pages(dp, spl_debug_file_name);
current->journal_info = journal_info; current->journal_info = journal_info;
} }
@ -373,29 +367,36 @@ spl_debug_dumplog_thread(void *arg)
dumplog_priv_t *dp = (dumplog_priv_t *)arg; dumplog_priv_t *dp = (dumplog_priv_t *)arg;
spl_debug_dumplog_internal(dp); spl_debug_dumplog_internal(dp);
atomic_set(&dp->dp_flag, 1); atomic_set(&dp->dp_done, 1);
wake_up(&dp->dp_waitq); wake_up(&dp->dp_waitq);
do_exit(0); do_exit(0);
return 0; /* Unreachable */ return 0; /* Unreachable */
} }
/* When flag is set do not use a new thread for the debug dump */
int int
spl_debug_dumplog(void) spl_debug_dumplog(int flags)
{ {
struct task_struct *tsk; struct task_struct *tsk;
dumplog_priv_t dp; dumplog_priv_t dp;
init_waitqueue_head(&dp.dp_waitq); init_waitqueue_head(&dp.dp_waitq);
dp.dp_pid = current->pid; dp.dp_pid = current->pid;
atomic_set(&dp.dp_flag, 0); dp.dp_flags = flags;
atomic_set(&dp.dp_done, 0);
if (dp.dp_flags & DL_NOTHREAD) {
spl_debug_dumplog_internal(&dp);
} else {
tsk = kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); tsk = kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug");
if (tsk == NULL) if (tsk == NULL)
return -ENOMEM; return -ENOMEM;
wake_up_process(tsk); wake_up_process(tsk);
wait_event(dp.dp_waitq, atomic_read(&dp.dp_flag)); wait_event(dp.dp_waitq, atomic_read(&dp.dp_done));
}
return 0; return 0;
} }
@ -849,7 +850,7 @@ EXPORT_SYMBOL(spl_debug_vmsg);
* some arch, this will have to be implemented separately in each arch. * some arch, this will have to be implemented separately in each arch.
*/ */
static void static void
panic_collect_pages(struct page_collection *pc) collect_pages_from_single_cpu(struct page_collection *pc)
{ {
struct trace_cpu_data *tcd; struct trace_cpu_data *tcd;
int i, j; int i, j;
@ -876,12 +877,12 @@ collect_pages_on_cpu(void *info)
} }
static void static void
collect_pages(struct page_collection *pc) collect_pages(dumplog_priv_t *dp, struct page_collection *pc)
{ {
INIT_LIST_HEAD(&pc->pc_pages); INIT_LIST_HEAD(&pc->pc_pages);
if (spl_panic_in_progress) if (spl_panic_in_progress || dp->dp_flags & DL_SINGLE_CPU)
panic_collect_pages(pc); collect_pages_from_single_cpu(pc);
else else
trace_call_on_all_cpus(collect_pages_on_cpu, pc); trace_call_on_all_cpus(collect_pages_on_cpu, pc);
} }
@ -944,7 +945,7 @@ trace_filp_open (const char *name, int flags, int mode, int *err)
#define trace_filp_poff(f) (&(f)->f_pos) #define trace_filp_poff(f) (&(f)->f_pos)
static int static int
spl_debug_dump_all_pages(char *filename) spl_debug_dump_all_pages(dumplog_priv_t *dp, char *filename)
{ {
struct page_collection pc; struct page_collection pc;
struct file *filp; struct file *filp;
@ -965,7 +966,7 @@ spl_debug_dump_all_pages(char *filename)
} }
spin_lock_init(&pc.pc_lock); spin_lock_init(&pc.pc_lock);
collect_pages(&pc); collect_pages(dp, &pc);
if (list_empty(&pc.pc_pages)) { if (list_empty(&pc.pc_pages)) {
rc = 0; rc = 0;
goto close; goto close;
@ -1006,13 +1007,18 @@ spl_debug_dump_all_pages(char *filename)
static void static void
spl_debug_flush_pages(void) spl_debug_flush_pages(void)
{ {
dumplog_priv_t dp;
struct page_collection pc; struct page_collection pc;
struct trace_page *tage; struct trace_page *tage;
struct trace_page *tmp; struct trace_page *tmp;
spin_lock_init(&pc.pc_lock); spin_lock_init(&pc.pc_lock);
init_waitqueue_head(&dp.dp_waitq);
dp.dp_pid = current->pid;
dp.dp_flags = 0;
atomic_set(&dp.dp_done, 0);
collect_pages(&pc); collect_pages(&dp, &pc);
list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
__ASSERT_TAGE_INVARIANT(tage); __ASSERT_TAGE_INVARIANT(tage);
list_del(&tage->linkage); list_del(&tage->linkage);
@ -1109,7 +1115,7 @@ void spl_debug_dumpstack(struct task_struct *tsk)
} }
EXPORT_SYMBOL(spl_debug_dumpstack); EXPORT_SYMBOL(spl_debug_dumpstack);
void spl_debug_bug(char *file, const char *func, const int line) void spl_debug_bug(char *file, const char *func, const int line, int flags)
{ {
spl_debug_catastrophe = 1; spl_debug_catastrophe = 1;
spl_debug_msg(NULL, 0, D_EMERG, file, func, line, "SBUG\n"); spl_debug_msg(NULL, 0, D_EMERG, file, func, line, "SBUG\n");
@ -1124,7 +1130,7 @@ void spl_debug_bug(char *file, const char *func, const int line)
spl_panic_in_progress = 1; spl_panic_in_progress = 1;
spl_debug_dumpstack(NULL); spl_debug_dumpstack(NULL);
spl_debug_dumplog(); spl_debug_dumplog(flags);
if (spl_debug_panic_on_bug) if (spl_debug_panic_on_bug)
panic("SBUG"); panic("SBUG");
@ -1168,7 +1174,7 @@ panic_notifier(struct notifier_block *self,
while (current->lock_depth >= 0) while (current->lock_depth >= 0)
unlock_kernel(); unlock_kernel();
spl_debug_dumplog_internal((void *)(long)current->pid); spl_debug_dumplog(DL_NOTHREAD | DL_SINGLE_CPU);
} }
return 0; return 0;

View File

@ -179,7 +179,7 @@ proc_dump_kernel(struct ctl_table *table, int write, struct file *filp,
ENTRY; ENTRY;
if (write) { if (write) {
spl_debug_dumplog(); spl_debug_dumplog(0);
*ppos += *lenp; *ppos += *lenp;
} else { } else {
*lenp = 0; *lenp = 0;