Allow zfs-tests to recover from hibernation
When a system sleeps during a zfs-test, the time spent hibernating is counted against the test's runtime even though the test can't and isn't running. This patch tries to detect timeouts due to hibernation and reruns tests that timed out due to system sleeping. In this version of the patch, the existing behavior of returning non-zero when a test was killed is preserved. With this patch applied we still return nonzero and we also automatically rerun the test we suspect of being killed due to system hibernation. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: John Kennedy <john.kennedy@delphix.com> Signed-off-by: Alek Pinchuk <apinchuk@datto.com> Closes #8575
This commit is contained in:
parent
48ed0f9da0
commit
b31cf30a15
|
@ -13,7 +13,7 @@
|
||||||
|
|
||||||
#
|
#
|
||||||
# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||||
# Copyright (c) 2017 Datto Inc.
|
# Copyright (c) 2019 Datto Inc.
|
||||||
#
|
#
|
||||||
# This script must remain compatible with Python 2.6+ and Python 3.4+.
|
# This script must remain compatible with Python 2.6+ and Python 3.4+.
|
||||||
#
|
#
|
||||||
|
@ -26,6 +26,7 @@ except ImportError:
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import ctypes
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
@ -47,10 +48,33 @@ LOG_OUT = 'LOG_OUT'
|
||||||
LOG_ERR = 'LOG_ERR'
|
LOG_ERR = 'LOG_ERR'
|
||||||
LOG_FILE_OBJ = None
|
LOG_FILE_OBJ = None
|
||||||
|
|
||||||
|
# some python 2.7 system don't have a concept of monotonic time
|
||||||
|
CLOCK_MONOTONIC_RAW = 4 # see <linux/time.h>
|
||||||
|
|
||||||
|
|
||||||
|
class timespec(ctypes.Structure):
|
||||||
|
_fields_ = [
|
||||||
|
('tv_sec', ctypes.c_long),
|
||||||
|
('tv_nsec', ctypes.c_long)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
librt = ctypes.CDLL('librt.so.1', use_errno=True)
|
||||||
|
clock_gettime = librt.clock_gettime
|
||||||
|
clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)]
|
||||||
|
|
||||||
|
|
||||||
|
def monotonic_time():
|
||||||
|
t = timespec()
|
||||||
|
if clock_gettime(CLOCK_MONOTONIC_RAW, ctypes.pointer(t)) != 0:
|
||||||
|
errno_ = ctypes.get_errno()
|
||||||
|
raise OSError(errno_, os.strerror(errno_))
|
||||||
|
return t.tv_sec + t.tv_nsec * 1e-9
|
||||||
|
|
||||||
|
|
||||||
class Result(object):
|
class Result(object):
|
||||||
total = 0
|
total = 0
|
||||||
runresults = {'PASS': 0, 'FAIL': 0, 'SKIP': 0, 'KILLED': 0}
|
runresults = {'PASS': 0, 'FAIL': 0, 'SKIP': 0, 'KILLED': 0, 'RERAN': 0}
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.starttime = None
|
self.starttime = None
|
||||||
|
@ -60,14 +84,16 @@ class Result(object):
|
||||||
self.stderr = []
|
self.stderr = []
|
||||||
self.result = ''
|
self.result = ''
|
||||||
|
|
||||||
def done(self, proc, killed):
|
def done(self, proc, killed, reran):
|
||||||
"""
|
"""
|
||||||
Finalize the results of this Cmd.
|
Finalize the results of this Cmd.
|
||||||
"""
|
"""
|
||||||
Result.total += 1
|
Result.total += 1
|
||||||
m, s = divmod(time() - self.starttime, 60)
|
m, s = divmod(monotonic_time() - self.starttime, 60)
|
||||||
self.runtime = '%02d:%02d' % (m, s)
|
self.runtime = '%02d:%02d' % (m, s)
|
||||||
self.returncode = proc.returncode
|
self.returncode = proc.returncode
|
||||||
|
if reran is True:
|
||||||
|
Result.runresults['RERAN'] += 1
|
||||||
if killed:
|
if killed:
|
||||||
self.result = 'KILLED'
|
self.result = 'KILLED'
|
||||||
Result.runresults['KILLED'] += 1
|
Result.runresults['KILLED'] += 1
|
||||||
|
@ -133,9 +159,13 @@ class Cmd(object):
|
||||||
tags=None):
|
tags=None):
|
||||||
self.pathname = pathname
|
self.pathname = pathname
|
||||||
self.outputdir = outputdir or 'BASEDIR'
|
self.outputdir = outputdir or 'BASEDIR'
|
||||||
|
"""
|
||||||
|
The timeout for tests is measured in wall-clock time
|
||||||
|
"""
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
self.user = user or ''
|
self.user = user or ''
|
||||||
self.killed = False
|
self.killed = False
|
||||||
|
self.reran = None
|
||||||
self.result = Result()
|
self.result = Result()
|
||||||
|
|
||||||
if self.timeout is None:
|
if self.timeout is None:
|
||||||
|
@ -145,7 +175,7 @@ class Cmd(object):
|
||||||
return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nUser: %s\n" % \
|
return "Pathname: %s\nOutputdir: %s\nTimeout: %d\nUser: %s\n" % \
|
||||||
(self.pathname, self.outputdir, self.timeout, self.user)
|
(self.pathname, self.outputdir, self.timeout, self.user)
|
||||||
|
|
||||||
def kill_cmd(self, proc):
|
def kill_cmd(self, proc, keyboard_interrupt=False):
|
||||||
"""
|
"""
|
||||||
Kill a running command due to timeout, or ^C from the keyboard. If
|
Kill a running command due to timeout, or ^C from the keyboard. If
|
||||||
sudo is required, this user was verified previously.
|
sudo is required, this user was verified previously.
|
||||||
|
@ -164,6 +194,20 @@ class Cmd(object):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
"""
|
||||||
|
If this is not a user-initiated kill and the test has not been
|
||||||
|
reran before we consider if the test needs to be reran:
|
||||||
|
If the test has spent some time hibernating and didn't run the whole
|
||||||
|
length of time before being timed out we will rerun the test.
|
||||||
|
"""
|
||||||
|
if keyboard_interrupt is False and self.reran is None:
|
||||||
|
runtime = monotonic_time() - self.result.starttime
|
||||||
|
if int(self.timeout) > runtime:
|
||||||
|
self.killed = False
|
||||||
|
self.reran = False
|
||||||
|
self.run(False)
|
||||||
|
self.reran = True
|
||||||
|
|
||||||
def update_cmd_privs(self, cmd, user):
|
def update_cmd_privs(self, cmd, user):
|
||||||
"""
|
"""
|
||||||
If a user has been specified to run this Cmd and we're not already
|
If a user has been specified to run this Cmd and we're not already
|
||||||
|
@ -207,13 +251,13 @@ class Cmd(object):
|
||||||
|
|
||||||
return out.lines, err.lines
|
return out.lines, err.lines
|
||||||
|
|
||||||
def run(self, options):
|
def run(self, dryrun):
|
||||||
"""
|
"""
|
||||||
This is the main function that runs each individual test.
|
This is the main function that runs each individual test.
|
||||||
Determine whether or not the command requires sudo, and modify it
|
Determine whether or not the command requires sudo, and modify it
|
||||||
if needed. Run the command, and update the result object.
|
if needed. Run the command, and update the result object.
|
||||||
"""
|
"""
|
||||||
if options.dryrun is True:
|
if dryrun is True:
|
||||||
print(self)
|
print(self)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -226,7 +270,7 @@ class Cmd(object):
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
fail('%s' % e)
|
fail('%s' % e)
|
||||||
|
|
||||||
self.result.starttime = time()
|
self.result.starttime = monotonic_time()
|
||||||
proc = Popen(privcmd, stdout=PIPE, stderr=PIPE)
|
proc = Popen(privcmd, stdout=PIPE, stderr=PIPE)
|
||||||
# Allow a special timeout value of 0 to mean infinity
|
# Allow a special timeout value of 0 to mean infinity
|
||||||
if int(self.timeout) == 0:
|
if int(self.timeout) == 0:
|
||||||
|
@ -237,12 +281,13 @@ class Cmd(object):
|
||||||
t.start()
|
t.start()
|
||||||
self.result.stdout, self.result.stderr = self.collect_output(proc)
|
self.result.stdout, self.result.stderr = self.collect_output(proc)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
self.kill_cmd(proc)
|
self.kill_cmd(proc, True)
|
||||||
fail('\nRun terminated at user request.')
|
fail('\nRun terminated at user request.')
|
||||||
finally:
|
finally:
|
||||||
t.cancel()
|
t.cancel()
|
||||||
|
|
||||||
self.result.done(proc, self.killed)
|
if self.reran is not False:
|
||||||
|
self.result.done(proc, self.killed, self.reran)
|
||||||
|
|
||||||
def skip(self):
|
def skip(self):
|
||||||
"""
|
"""
|
||||||
|
@ -252,8 +297,8 @@ class Cmd(object):
|
||||||
Result.total += 1
|
Result.total += 1
|
||||||
Result.runresults['SKIP'] += 1
|
Result.runresults['SKIP'] += 1
|
||||||
self.result.stdout = self.result.stderr = []
|
self.result.stdout = self.result.stderr = []
|
||||||
self.result.starttime = time()
|
self.result.starttime = monotonic_time()
|
||||||
m, s = divmod(time() - self.result.starttime, 60)
|
m, s = divmod(monotonic_time() - self.result.starttime, 60)
|
||||||
self.result.runtime = '%02d:%02d' % (m, s)
|
self.result.runtime = '%02d:%02d' % (m, s)
|
||||||
self.result.result = 'SKIP'
|
self.result.result = 'SKIP'
|
||||||
|
|
||||||
|
@ -266,9 +311,12 @@ class Cmd(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logname = getpwuid(os.getuid()).pw_name
|
logname = getpwuid(os.getuid()).pw_name
|
||||||
|
rer = ''
|
||||||
|
if self.reran is True:
|
||||||
|
rer = ' (RERAN)'
|
||||||
user = ' (run as %s)' % (self.user if len(self.user) else logname)
|
user = ' (run as %s)' % (self.user if len(self.user) else logname)
|
||||||
msga = 'Test: %s%s ' % (self.pathname, user)
|
msga = 'Test: %s%s ' % (self.pathname, user)
|
||||||
msgb = '[%s] [%s]\n' % (self.result.runtime, self.result.result)
|
msgb = '[%s] [%s]%s\n' % (self.result.runtime, self.result.result, rer)
|
||||||
pad = ' ' * (80 - (len(msga) + len(msgb)))
|
pad = ' ' * (80 - (len(msga) + len(msgb)))
|
||||||
result_line = msga + pad + msgb
|
result_line = msga + pad + msgb
|
||||||
|
|
||||||
|
@ -368,19 +416,19 @@ class Test(Cmd):
|
||||||
|
|
||||||
cont = True
|
cont = True
|
||||||
if len(pretest.pathname):
|
if len(pretest.pathname):
|
||||||
pretest.run(options)
|
pretest.run(options.dryrun)
|
||||||
cont = pretest.result.result == 'PASS'
|
cont = pretest.result.result == 'PASS'
|
||||||
pretest.log(options)
|
pretest.log(options)
|
||||||
|
|
||||||
if cont:
|
if cont:
|
||||||
test.run(options)
|
test.run(options.dryrun)
|
||||||
else:
|
else:
|
||||||
test.skip()
|
test.skip()
|
||||||
|
|
||||||
test.log(options)
|
test.log(options)
|
||||||
|
|
||||||
if len(posttest.pathname):
|
if len(posttest.pathname):
|
||||||
posttest.run(options)
|
posttest.run(options.dryrun)
|
||||||
posttest.log(options)
|
posttest.log(options)
|
||||||
|
|
||||||
|
|
||||||
|
@ -469,7 +517,7 @@ class TestGroup(Test):
|
||||||
|
|
||||||
cont = True
|
cont = True
|
||||||
if len(pretest.pathname):
|
if len(pretest.pathname):
|
||||||
pretest.run(options)
|
pretest.run(options.dryrun)
|
||||||
cont = pretest.result.result == 'PASS'
|
cont = pretest.result.result == 'PASS'
|
||||||
pretest.log(options)
|
pretest.log(options)
|
||||||
|
|
||||||
|
@ -478,14 +526,14 @@ class TestGroup(Test):
|
||||||
outputdir=os.path.join(self.outputdir, fname),
|
outputdir=os.path.join(self.outputdir, fname),
|
||||||
timeout=self.timeout, user=self.user)
|
timeout=self.timeout, user=self.user)
|
||||||
if cont:
|
if cont:
|
||||||
test.run(options)
|
test.run(options.dryrun)
|
||||||
else:
|
else:
|
||||||
test.skip()
|
test.skip()
|
||||||
|
|
||||||
test.log(options)
|
test.log(options)
|
||||||
|
|
||||||
if len(posttest.pathname):
|
if len(posttest.pathname):
|
||||||
posttest.run(options)
|
posttest.run(options.dryrun)
|
||||||
posttest.log(options)
|
posttest.log(options)
|
||||||
|
|
||||||
|
|
||||||
|
@ -733,6 +781,9 @@ class TestRun(object):
|
||||||
if Result.runresults['KILLED'] > 0:
|
if Result.runresults['KILLED'] > 0:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
if Result.runresults['RERAN'] > 0:
|
||||||
|
return 3
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue