ZTS: Add a failsafe callback to run after each test

Tests that get killed do not have an opportunity to clean up.

There are many bad states this can leave the system in, but of
particular gravity is when zinject has been used to induce bad
behavior for one or more of the test disks.

Create a failsafe mechanism in test-runner.py that runs a callback
script after every test. The script is common to all tests so all
tests benefit from the protection.

Add an obligatory `zinject -c all` to clear all zinject state after
every test case is run.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Signed-off-by: Ryan Moeller <ryan@iXsystems.com>
Closes #10096
This commit is contained in:
Ryan Moeller 2020-03-10 14:00:56 -04:00 committed by GitHub
parent 1dc32a67e9
commit ddd9ef3a4f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 145 additions and 48 deletions

View File

@ -17,6 +17,8 @@ user = root
timeout = 600 timeout = 600
post_user = root post_user = root
post = cleanup post = cleanup
failsafe_user = root
failsafe = callbacks/zfs_failsafe
outputdir = /var/tmp/test_results outputdir = /var/tmp/test_results
tags = ['functional'] tags = ['functional']

View File

@ -17,6 +17,8 @@ user = root
timeout = 600 timeout = 600
post_user = root post_user = root
post = cleanup post = cleanup
failsafe_user = root
failsafe = callbacks/zfs_failsafe
outputdir = /var/tmp/test_results outputdir = /var/tmp/test_results
tags = ['functional'] tags = ['functional']

View File

@ -17,6 +17,8 @@ user = root
timeout = 600 timeout = 600
post_user = root post_user = root
post = cleanup post = cleanup
failsafe_user = root
failsafe = callbacks/zfs_failsafe
outputdir = /var/tmp/test_results outputdir = /var/tmp/test_results
tags = ['functional'] tags = ['functional']

View File

@ -173,10 +173,13 @@ class Cmd(object):
self.timeout = 60 self.timeout = 60
def __str__(self): def __str__(self):
return "Pathname: %s\nIdentifier: %s\nOutputdir: %s\nTimeout: %d\n" \ return '''\
"User: %s\n" % \ Pathname: %s
(self.pathname, self.identifier, self.outputdir, self.timeout, Identifier: %s
self.user) Outputdir: %s
Timeout: %d
User: %s
''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user)
def kill_cmd(self, proc, keyboard_interrupt=False): def kill_cmd(self, proc, keyboard_interrupt=False):
""" """
@ -305,7 +308,7 @@ class Cmd(object):
self.result.runtime = '%02d:%02d' % (m, s) self.result.runtime = '%02d:%02d' % (m, s)
self.result.result = 'SKIP' self.result.result = 'SKIP'
def log(self, options): def log(self, options, suppress_console=False):
""" """
This function is responsible for writing all output. This includes This function is responsible for writing all output. This includes
the console output, the logfile of all results (with timestamped the console output, the logfile of all results (with timestamped
@ -328,12 +331,14 @@ class Cmd(object):
# The result line is always written to the log file. If -q was # The result line is always written to the log file. If -q was
# specified only failures are written to the console, otherwise # specified only failures are written to the console, otherwise
# the result line is written to the console. # the result line is written to the console. The console output
# may be suppressed by calling log() with suppress_console=True.
write_log(bytearray(result_line, encoding='utf-8'), LOG_FILE) write_log(bytearray(result_line, encoding='utf-8'), LOG_FILE)
if not options.quiet: if not suppress_console:
write_log(result_line, LOG_OUT) if not options.quiet:
elif options.quiet and self.result.result != 'PASS': write_log(result_line, LOG_OUT)
write_log(result_line, LOG_OUT) elif options.quiet and self.result.result != 'PASS':
write_log(result_line, LOG_OUT)
lines = sorted(self.result.stdout + self.result.stderr, lines = sorted(self.result.stdout + self.result.stderr,
key=lambda x: x[0]) key=lambda x: x[0])
@ -361,36 +366,49 @@ class Cmd(object):
class Test(Cmd): class Test(Cmd):
props = ['outputdir', 'timeout', 'user', 'pre', 'pre_user', 'post', props = ['outputdir', 'timeout', 'user', 'pre', 'pre_user', 'post',
'post_user', 'tags'] 'post_user', 'failsafe', 'failsafe_user', 'tags']
def __init__(self, pathname, def __init__(self, pathname,
pre=None, pre_user=None, post=None, post_user=None, tags=None, pre=None, pre_user=None, post=None, post_user=None,
**kwargs): failsafe=None, failsafe_user=None, tags=None, **kwargs):
super(Test, self).__init__(pathname, **kwargs) super(Test, self).__init__(pathname, **kwargs)
self.pre = pre or '' self.pre = pre or ''
self.pre_user = pre_user or '' self.pre_user = pre_user or ''
self.post = post or '' self.post = post or ''
self.post_user = post_user or '' self.post_user = post_user or ''
self.failsafe = failsafe or ''
self.failsafe_user = failsafe_user or ''
self.tags = tags or [] self.tags = tags or []
def __str__(self): def __str__(self):
post_user = pre_user = '' post_user = pre_user = failsafe_user = ''
if len(self.pre_user): if len(self.pre_user):
pre_user = ' (as %s)' % (self.pre_user) pre_user = ' (as %s)' % (self.pre_user)
if len(self.post_user): if len(self.post_user):
post_user = ' (as %s)' % (self.post_user) post_user = ' (as %s)' % (self.post_user)
return "Pathname: %s\nIdentifier: %s\nOutputdir: %s\nTimeout: %d\n" \ if len(self.failsafe_user):
"Pre: %s%s\nPost: %s%s\nUser: %s\nTags: %s\n" % \ failsafe_user = ' (as %s)' % (self.failsafe_user)
(self.pathname, self.identifier, self.outputdir, self.timeout, return '''\
self.pre, pre_user, self.post, post_user, self.user, self.tags) Pathname: %s
Identifier: %s
Outputdir: %s
Timeout: %d
User: %s
Pre: %s%s
Post: %s%s
Failsafe: %s%s
Tags: %s
''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user,
self.pre, pre_user, self.post, post_user, self.failsafe,
failsafe_user, self.tags)
def verify(self): def verify(self):
""" """
Check the pre/post scripts, user and Test. Omit the Test from this Check the pre/post/failsafe scripts, user and Test. Omit the Test from
run if there are any problems. this run if there are any problems.
""" """
files = [self.pre, self.pathname, self.post] files = [self.pre, self.pathname, self.post, self.failsafe]
users = [self.pre_user, self.user, self.post_user] users = [self.pre_user, self.user, self.post_user, self.failsafe_user]
for f in [f for f in files if len(f)]: for f in [f for f in files if len(f)]:
if not verify_file(f): if not verify_file(f):
@ -408,8 +426,9 @@ class Test(Cmd):
def run(self, options): def run(self, options):
""" """
Create Cmd instances for the pre/post scripts. If the pre script Create Cmd instances for the pre/post/failsafe scripts. If the pre
doesn't pass, skip this Test. Run the post script regardless. script doesn't pass, skip this Test. Run the post script regardless.
If the Test is killed, also run the failsafe script.
""" """
odir = os.path.join(self.outputdir, os.path.basename(self.pre)) odir = os.path.join(self.outputdir, os.path.basename(self.pre))
pretest = Cmd(self.pre, identifier=self.identifier, outputdir=odir, pretest = Cmd(self.pre, identifier=self.identifier, outputdir=odir,
@ -417,6 +436,10 @@ class Test(Cmd):
test = Cmd(self.pathname, identifier=self.identifier, test = Cmd(self.pathname, identifier=self.identifier,
outputdir=self.outputdir, timeout=self.timeout, outputdir=self.outputdir, timeout=self.timeout,
user=self.user) user=self.user)
odir = os.path.join(self.outputdir, os.path.basename(self.failsafe))
failsafe = Cmd(self.failsafe, identifier=self.identifier,
outputdir=odir, timeout=self.timeout,
user=self.failsafe_user)
odir = os.path.join(self.outputdir, os.path.basename(self.post)) odir = os.path.join(self.outputdir, os.path.basename(self.post))
posttest = Cmd(self.post, identifier=self.identifier, outputdir=odir, posttest = Cmd(self.post, identifier=self.identifier, outputdir=odir,
timeout=self.timeout, user=self.post_user) timeout=self.timeout, user=self.post_user)
@ -429,6 +452,9 @@ class Test(Cmd):
if cont: if cont:
test.run(options.dryrun) test.run(options.dryrun)
if test.result.result == 'KILLED' and len(failsafe.pathname):
failsafe.run(options.dryrun)
failsafe.log(options, suppress_console=True)
else: else:
test.skip() test.skip()
@ -447,35 +473,48 @@ class TestGroup(Test):
self.tests = tests or [] self.tests = tests or []
def __str__(self): def __str__(self):
post_user = pre_user = '' post_user = pre_user = failsafe_user = ''
if len(self.pre_user): if len(self.pre_user):
pre_user = ' (as %s)' % (self.pre_user) pre_user = ' (as %s)' % (self.pre_user)
if len(self.post_user): if len(self.post_user):
post_user = ' (as %s)' % (self.post_user) post_user = ' (as %s)' % (self.post_user)
return "Pathname: %s\nIdentifier: %s\nOutputdir: %s\nTests: %s\n" \ if len(self.failsafe_user):
"Timeout: %s\nPre: %s%s\nPost: %s%s\nUser: %s\nTags: %s\n" % \ failsafe_user = ' (as %s)' % (self.failsafe_user)
(self.pathname, self.identifier, self.outputdir, self.tests, return '''\
self.timeout, self.pre, pre_user, self.post, post_user, Pathname: %s
self.user, self.tags) Identifier: %s
Outputdir: %s
Tests: %s
Timeout: %s
User: %s
Pre: %s%s
Post: %s%s
Failsafe: %s%s
Tags: %s
''' % (self.pathname, self.identifier, self.outputdir, self.tests,
self.timeout, self.user, self.pre, pre_user, self.post, post_user,
self.failsafe, failsafe_user, self.tags)
def verify(self): def verify(self):
""" """
Check the pre/post scripts, user and tests in this TestGroup. Omit Check the pre/post/failsafe scripts, user and tests in this TestGroup.
the TestGroup entirely, or simply delete the relevant tests in the Omit the TestGroup entirely, or simply delete the relevant tests in the
group, if that's all that's required. group, if that's all that's required.
""" """
# If the pre or post scripts are relative pathnames, convert to # If the pre/post/failsafe scripts are relative pathnames, convert to
# absolute, so they stand a chance of passing verification. # absolute, so they stand a chance of passing verification.
if len(self.pre) and not os.path.isabs(self.pre): if len(self.pre) and not os.path.isabs(self.pre):
self.pre = os.path.join(self.pathname, self.pre) self.pre = os.path.join(self.pathname, self.pre)
if len(self.post) and not os.path.isabs(self.post): if len(self.post) and not os.path.isabs(self.post):
self.post = os.path.join(self.pathname, self.post) self.post = os.path.join(self.pathname, self.post)
if len(self.failsafe) and not os.path.isabs(self.failsafe):
self.post = os.path.join(self.pathname, self.post)
auxfiles = [self.pre, self.post] auxfiles = [self.pre, self.post, self.failsafe]
users = [self.pre_user, self.user, self.post_user] users = [self.pre_user, self.user, self.post_user, self.failsafe_user]
for f in [f for f in auxfiles if len(f)]: for f in [f for f in auxfiles if len(f)]:
if self.pathname != os.path.dirname(f): if f != self.failsafe and self.pathname != os.path.dirname(f):
write_log("Warning: TestGroup '%s' not added to this run. " write_log("Warning: TestGroup '%s' not added to this run. "
"Auxiliary script '%s' exists in a different " "Auxiliary script '%s' exists in a different "
"directory.\n" % (self.pathname, f), LOG_ERR) "directory.\n" % (self.pathname, f), LOG_ERR)
@ -505,9 +544,9 @@ class TestGroup(Test):
def run(self, options): def run(self, options):
""" """
Create Cmd instances for the pre/post scripts. If the pre script Create Cmd instances for the pre/post/failsafe scripts. If the pre
doesn't pass, skip all the tests in this TestGroup. Run the post script doesn't pass, skip all the tests in this TestGroup. Run the
script regardless. post script regardless. Run the failsafe script when a test is killed.
""" """
# tags assigned to this test group also include the test names # tags assigned to this test group also include the test names
if options.tags and not set(self.tags).intersection(set(options.tags)): if options.tags and not set(self.tags).intersection(set(options.tags)):
@ -527,12 +566,18 @@ class TestGroup(Test):
pretest.log(options) pretest.log(options)
for fname in self.tests: for fname in self.tests:
test = Cmd(os.path.join(self.pathname, fname), odir = os.path.join(self.outputdir, fname)
outputdir=os.path.join(self.outputdir, fname), test = Cmd(os.path.join(self.pathname, fname), outputdir=odir,
timeout=self.timeout, user=self.user, timeout=self.timeout, user=self.user,
identifier=self.identifier) identifier=self.identifier)
odir = os.path.join(odir, os.path.basename(self.failsafe))
failsafe = Cmd(self.failsafe, outputdir=odir, timeout=self.timeout,
user=self.failsafe_user, identifier=self.identifier)
if cont: if cont:
test.run(options.dryrun) test.run(options.dryrun)
if test.result.result == 'KILLED' and len(failsafe.pathname):
failsafe.run(options.dryrun)
failsafe.log(options, suppress_console=True)
else: else:
test.skip() test.skip()
@ -562,6 +607,8 @@ class TestRun(object):
('pre_user', ''), ('pre_user', ''),
('post', ''), ('post', ''),
('post_user', ''), ('post_user', ''),
('failsafe', ''),
('failsafe_user', ''),
('tags', []) ('tags', [])
] ]
@ -599,8 +646,8 @@ class TestRun(object):
for prop in Test.props: for prop in Test.props:
setattr(testgroup, prop, getattr(options, prop)) setattr(testgroup, prop, getattr(options, prop))
# Prevent pre/post scripts from running as regular tests # Prevent pre/post/failsafe scripts from running as regular tests
for f in [testgroup.pre, testgroup.post]: for f in [testgroup.pre, testgroup.post, testgroup.failsafe]:
if f in filenames: if f in filenames:
del filenames[filenames.index(f)] del filenames[filenames.index(f)]
@ -630,6 +677,8 @@ class TestRun(object):
setattr(self, opt, config.get('DEFAULT', opt)) setattr(self, opt, config.get('DEFAULT', opt))
self.outputdir = os.path.join(self.outputdir, self.timestamp) self.outputdir = os.path.join(self.outputdir, self.timestamp)
testdir = options.testdir
for section in config.sections(): for section in config.sections():
if 'tests' in config.options(section): if 'tests' in config.options(section):
parts = section.split(':', 1) parts = section.split(':', 1)
@ -637,8 +686,8 @@ class TestRun(object):
identifier = parts[1] if len(parts) == 2 else None identifier = parts[1] if len(parts) == 2 else None
if os.path.isdir(sectiondir): if os.path.isdir(sectiondir):
pathname = sectiondir pathname = sectiondir
elif os.path.isdir(os.path.join(options.testdir, sectiondir)): elif os.path.isdir(os.path.join(testdir, sectiondir)):
pathname = os.path.join(options.testdir, sectiondir) pathname = os.path.join(testdir, sectiondir)
else: else:
pathname = sectiondir pathname = sectiondir
@ -647,9 +696,13 @@ class TestRun(object):
for prop in TestGroup.props: for prop in TestGroup.props:
for sect in ['DEFAULT', section]: for sect in ['DEFAULT', section]:
if config.has_option(sect, prop): if config.has_option(sect, prop):
if prop == "tags": if prop == 'tags':
setattr(testgroup, prop, setattr(testgroup, prop,
eval(config.get(sect, prop))) eval(config.get(sect, prop)))
elif prop == 'failsafe':
failsafe = config.get(sect, prop)
setattr(testgroup, prop,
os.path.join(testdir, failsafe))
else: else:
setattr(testgroup, prop, setattr(testgroup, prop,
config.get(sect, prop)) config.get(sect, prop))
@ -664,7 +717,12 @@ class TestRun(object):
for prop in Test.props: for prop in Test.props:
for sect in ['DEFAULT', section]: for sect in ['DEFAULT', section]:
if config.has_option(sect, prop): if config.has_option(sect, prop):
setattr(test, prop, config.get(sect, prop)) if prop == 'failsafe':
failsafe = config.get(sect, prop)
setattr(test, prop,
os.path.join(testdir, failsafe))
else:
setattr(test, prop, config.get(sect, prop))
if test.verify(): if test.verify():
self.tests[section] = test self.tests[section] = test
@ -706,7 +764,8 @@ class TestRun(object):
outputdir, and are guaranteed uniqueness because a group can only outputdir, and are guaranteed uniqueness because a group can only
contain files in one directory. Pre and post tests will create a contain files in one directory. Pre and post tests will create a
directory rooted at the outputdir of the Test or TestGroup in directory rooted at the outputdir of the Test or TestGroup in
question for their output. question for their output. Failsafe scripts will create a directory
rooted at the outputdir of each Test for their output.
""" """
done = False done = False
components = 0 components = 0
@ -932,6 +991,13 @@ def parse_args():
type='string', help='Specify a post script.') type='string', help='Specify a post script.')
parser.add_option('-q', action='store_true', default=False, dest='quiet', parser.add_option('-q', action='store_true', default=False, dest='quiet',
help='Silence on the console during a test run.') help='Silence on the console during a test run.')
parser.add_option('-s', action='callback', callback=options_cb,
default='', dest='failsafe', metavar='script',
type='string', help='Specify a failsafe script.')
parser.add_option('-S', action='callback', callback=options_cb,
default='', dest='failsafe_user',
metavar='failsafe_user', type='string',
help='Specify a user to execute the failsafe script.')
parser.add_option('-t', action='callback', callback=options_cb, default=60, parser.add_option('-t', action='callback', callback=options_cb, default=60,
dest='timeout', metavar='seconds', type='int', dest='timeout', metavar='seconds', type='int',
help='Timeout (in seconds) for an individual test.') help='Timeout (in seconds) for an individual test.')

View File

@ -251,6 +251,22 @@ Run \fIscript\fR after any test or test group.
Print only the results summary to the standard output. Print only the results summary to the standard output.
.RE .RE
.ne 2
.na
\fB-s\fR \fIscript\fR
.ad
.RS 6n
Run \fIscript\fR as a failsafe after any test is killed.
.RE
.ne 2
.na
\fB-S\fR \fIusername\fR
.ad
.RS 6n
Execute the failsafe script as \fIusername\fR.
.RE
.ne 2 .ne 2
.na .na
\fB-t\fR \fIn\fR \fB-t\fR \fIn\fR

View File

@ -1,5 +1,6 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/callbacks pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/callbacks
dist_pkgdata_SCRIPTS = \ dist_pkgdata_SCRIPTS = \
zfs_failsafe.ksh \
zfs_dbgmsg.ksh \ zfs_dbgmsg.ksh \
zfs_dmesg.ksh \ zfs_dmesg.ksh \
zfs_mmp.ksh zfs_mmp.ksh

View File

@ -0,0 +1,8 @@
#!/bin/ksh
# Commands to perform failsafe-critical cleanup after a test is killed.
#
# This should only be used to ensure the system is restored to a functional
# state in the event of tests being killed (preventing normal cleanup).
zinject -c all