Allow zfs-tests to recover from hibernation

When a system sleeps during a zfs-test, the time spent
hibernating is counted against the test's runtime even
though the test can't and isn't running.
This patch tries to detect timeouts due to hibernation and
reruns tests that timed out due to system sleeping.
In this version of the patch, the existing behavior of returning
non-zero when a test was killed is preserved. With this patch applied
we still return nonzero and we also automatically rerun the test we
suspect of being killed due to system hibernation.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Signed-off-by: Alek Pinchuk <apinchuk@datto.com>
Closes #8575
This commit is contained in:
Alek P 2019-04-11 10:20:37 -07:00 committed by Brian Behlendorf
parent 48ed0f9da0
commit b31cf30a15
1 changed files with 70 additions and 19 deletions

View File

@ -13,7 +13,7 @@
# #
# Copyright (c) 2012, 2018 by Delphix. All rights reserved. # Copyright (c) 2012, 2018 by Delphix. All rights reserved.
# Copyright (c) 2017 Datto Inc. # Copyright (c) 2019 Datto Inc.
# #
# This script must remain compatible with Python 2.6+ and Python 3.4+. # This script must remain compatible with Python 2.6+ and Python 3.4+.
# #
@ -26,6 +26,7 @@ except ImportError:
import os import os
import sys import sys
import ctypes
from datetime import datetime from datetime import datetime
from optparse import OptionParser from optparse import OptionParser
@ -47,10 +48,33 @@ LOG_OUT = 'LOG_OUT'
LOG_ERR = 'LOG_ERR' LOG_ERR = 'LOG_ERR'
LOG_FILE_OBJ = None LOG_FILE_OBJ = None
# some python 2.7 system don't have a concept of monotonic time
CLOCK_MONOTONIC_RAW = 4 # see <linux/time.h>
class timespec(ctypes.Structure):
_fields_ = [
('tv_sec', ctypes.c_long),
('tv_nsec', ctypes.c_long)
]
librt = ctypes.CDLL('librt.so.1', use_errno=True)
clock_gettime = librt.clock_gettime
clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)]
def monotonic_time():
t = timespec()
if clock_gettime(CLOCK_MONOTONIC_RAW, ctypes.pointer(t)) != 0:
errno_ = ctypes.get_errno()
raise OSError(errno_, os.strerror(errno_))
return t.tv_sec + t.tv_nsec * 1e-9
class Result(object): class Result(object):
total = 0 total = 0
runresults = {'PASS': 0, 'FAIL': 0, 'SKIP': 0, 'KILLED': 0} runresults = {'PASS': 0, 'FAIL': 0, 'SKIP': 0, 'KILLED': 0, 'RERAN': 0}
def __init__(self): def __init__(self):
self.starttime = None self.starttime = None
@ -60,14 +84,16 @@ class Result(object):
self.stderr = [] self.stderr = []
self.result = '' self.result = ''
def done(self, proc, killed): def done(self, proc, killed, reran):
""" """
Finalize the results of this Cmd. Finalize the results of this Cmd.
""" """
Result.total += 1 Result.total += 1
m, s = divmod(time() - self.starttime, 60) m, s = divmod(monotonic_time() - self.starttime, 60)
self.runtime = '%02d:%02d' % (m, s) self.runtime = '%02d:%02d' % (m, s)
self.returncode = proc.returncode self.returncode = proc.returncode
if reran is True:
Result.runresults['RERAN'] += 1
if killed: if killed:
self.result = 'KILLED' self.result = 'KILLED'
Result.runresults['KILLED'] += 1 Result.runresults['KILLED'] += 1
@ -133,9 +159,13 @@ class Cmd(object):
tags=None): tags=None):
self.pathname = pathname self.pathname = pathname
self.outputdir = outputdir or 'BASEDIR' self.outputdir = outputdir or 'BASEDIR'
"""
The timeout for tests is measured in wall-clock time
"""
self.timeout = timeout self.timeout = timeout
self.user = user or '' self.user = user or ''
self.killed = False self.killed = False
self.reran = None
self.result = Result() self.result = Result()
if self.timeout is None: if self.timeout is None:
@ -145,7 +175,7 @@ class Cmd(object):
return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nUser: %s\n" % \ return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nUser: %s\n" % \
(self.pathname, self.outputdir, self.timeout, self.user) (self.pathname, self.outputdir, self.timeout, self.user)
def kill_cmd(self, proc): def kill_cmd(self, proc, keyboard_interrupt=False):
""" """
Kill a running command due to timeout, or ^C from the keyboard. If Kill a running command due to timeout, or ^C from the keyboard. If
sudo is required, this user was verified previously. sudo is required, this user was verified previously.
@ -164,6 +194,20 @@ class Cmd(object):
except Exception: except Exception:
pass pass
"""
If this is not a user-initiated kill and the test has not been
reran before we consider if the test needs to be reran:
If the test has spent some time hibernating and didn't run the whole
length of time before being timed out we will rerun the test.
"""
if keyboard_interrupt is False and self.reran is None:
runtime = monotonic_time() - self.result.starttime
if int(self.timeout) > runtime:
self.killed = False
self.reran = False
self.run(False)
self.reran = True
def update_cmd_privs(self, cmd, user): def update_cmd_privs(self, cmd, user):
""" """
If a user has been specified to run this Cmd and we're not already If a user has been specified to run this Cmd and we're not already
@ -207,13 +251,13 @@ class Cmd(object):
return out.lines, err.lines return out.lines, err.lines
def run(self, options): def run(self, dryrun):
""" """
This is the main function that runs each individual test. This is the main function that runs each individual test.
Determine whether or not the command requires sudo, and modify it Determine whether or not the command requires sudo, and modify it
if needed. Run the command, and update the result object. if needed. Run the command, and update the result object.
""" """
if options.dryrun is True: if dryrun is True:
print(self) print(self)
return return
@ -226,7 +270,7 @@ class Cmd(object):
except OSError as e: except OSError as e:
fail('%s' % e) fail('%s' % e)
self.result.starttime = time() self.result.starttime = monotonic_time()
proc = Popen(privcmd, stdout=PIPE, stderr=PIPE) proc = Popen(privcmd, stdout=PIPE, stderr=PIPE)
# Allow a special timeout value of 0 to mean infinity # Allow a special timeout value of 0 to mean infinity
if int(self.timeout) == 0: if int(self.timeout) == 0:
@ -237,12 +281,13 @@ class Cmd(object):
t.start() t.start()
self.result.stdout, self.result.stderr = self.collect_output(proc) self.result.stdout, self.result.stderr = self.collect_output(proc)
except KeyboardInterrupt: except KeyboardInterrupt:
self.kill_cmd(proc) self.kill_cmd(proc, True)
fail('\nRun terminated at user request.') fail('\nRun terminated at user request.')
finally: finally:
t.cancel() t.cancel()
self.result.done(proc, self.killed) if self.reran is not False:
self.result.done(proc, self.killed, self.reran)
def skip(self): def skip(self):
""" """
@ -252,8 +297,8 @@ class Cmd(object):
Result.total += 1 Result.total += 1
Result.runresults['SKIP'] += 1 Result.runresults['SKIP'] += 1
self.result.stdout = self.result.stderr = [] self.result.stdout = self.result.stderr = []
self.result.starttime = time() self.result.starttime = monotonic_time()
m, s = divmod(time() - self.result.starttime, 60) m, s = divmod(monotonic_time() - self.result.starttime, 60)
self.result.runtime = '%02d:%02d' % (m, s) self.result.runtime = '%02d:%02d' % (m, s)
self.result.result = 'SKIP' self.result.result = 'SKIP'
@ -266,9 +311,12 @@ class Cmd(object):
""" """
logname = getpwuid(os.getuid()).pw_name logname = getpwuid(os.getuid()).pw_name
rer = ''
if self.reran is True:
rer = ' (RERAN)'
user = ' (run as %s)' % (self.user if len(self.user) else logname) user = ' (run as %s)' % (self.user if len(self.user) else logname)
msga = 'Test: %s%s ' % (self.pathname, user) msga = 'Test: %s%s ' % (self.pathname, user)
msgb = '[%s] [%s]\n' % (self.result.runtime, self.result.result) msgb = '[%s] [%s]%s\n' % (self.result.runtime, self.result.result, rer)
pad = ' ' * (80 - (len(msga) + len(msgb))) pad = ' ' * (80 - (len(msga) + len(msgb)))
result_line = msga + pad + msgb result_line = msga + pad + msgb
@ -368,19 +416,19 @@ class Test(Cmd):
cont = True cont = True
if len(pretest.pathname): if len(pretest.pathname):
pretest.run(options) pretest.run(options.dryrun)
cont = pretest.result.result == 'PASS' cont = pretest.result.result == 'PASS'
pretest.log(options) pretest.log(options)
if cont: if cont:
test.run(options) test.run(options.dryrun)
else: else:
test.skip() test.skip()
test.log(options) test.log(options)
if len(posttest.pathname): if len(posttest.pathname):
posttest.run(options) posttest.run(options.dryrun)
posttest.log(options) posttest.log(options)
@ -469,7 +517,7 @@ class TestGroup(Test):
cont = True cont = True
if len(pretest.pathname): if len(pretest.pathname):
pretest.run(options) pretest.run(options.dryrun)
cont = pretest.result.result == 'PASS' cont = pretest.result.result == 'PASS'
pretest.log(options) pretest.log(options)
@ -478,14 +526,14 @@ class TestGroup(Test):
outputdir=os.path.join(self.outputdir, fname), outputdir=os.path.join(self.outputdir, fname),
timeout=self.timeout, user=self.user) timeout=self.timeout, user=self.user)
if cont: if cont:
test.run(options) test.run(options.dryrun)
else: else:
test.skip() test.skip()
test.log(options) test.log(options)
if len(posttest.pathname): if len(posttest.pathname):
posttest.run(options) posttest.run(options.dryrun)
posttest.log(options) posttest.log(options)
@ -733,6 +781,9 @@ class TestRun(object):
if Result.runresults['KILLED'] > 0: if Result.runresults['KILLED'] > 0:
return 1 return 1
if Result.runresults['RERAN'] > 0:
return 3
return 0 return 0