# # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # Copyright (c) 2012, 2019 by Delphix. All rights reserved. # Copyright 2016 Nexenta Systems, Inc. # Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. # Copyright (c) 2017 Lawrence Livermore National Security, LLC. # Copyright (c) 2017 Datto Inc. # Copyright (c) 2017 Open-E, Inc. All Rights Reserved. # Copyright 2019 Richard Elling # # # Returns SCSI host number for the given disk # function get_scsi_host #disk { typeset disk=$1 ls /sys/block/${disk}/device/scsi_device | cut -d : -f 1 } # # Cause a scan of all scsi host adapters by default # # $1 optional host number # function scan_scsi_hosts { typeset hostnum=${1} if is_linux; then if [[ -z $hostnum ]]; then for host in /sys/class/scsi_host/host*; do log_must eval "echo '- - -' > $host/scan" done else log_must eval \ "echo /sys/class/scsi_host/host$hostnum/scan" \ > /dev/null log_must eval \ "echo '- - -' > /sys/class/scsi_host/host$hostnum/scan" fi fi } # # Wait for newly created block devices to have their minors created. # Additional arguments can be passed to udevadm trigger, with the expected # arguments to typically be a block device pathname. This is useful when # checking waiting on a specific device to settle rather than triggering # all devices and waiting for them all to settle. # # The udevadm settle timeout can be 120 or 180 seconds by default for # some distros. If a long delay is experienced, it could be due to some # strangeness in a malfunctioning device that isn't related to the devices # under test. To help debug this condition, a notice is given if settle takes # too long. # # Note: there is no meaningful return code if udevadm fails. Consumers # should not expect a return code (do not call as argument to log_must) # function block_device_wait { if is_linux; then udevadm trigger $* typeset start=$SECONDS udevadm settle typeset elapsed=$((SECONDS - start)) [[ $elapsed > 60 ]] && \ log_note udevadm settle time too long: $elapsed elif is_freebsd; then if [[ ${#@} -eq 0 ]]; then # Do something that has to go through the geom event # queue to complete. sysctl kern.geom.conftxt >/dev/null return fi fi # Poll for the given paths to appear, but give up eventually. typeset -i i for (( i = 0; i < 5; ++i )); do typeset missing=false typeset dev for dev in "${@}"; do if ! [[ -f $dev ]]; then missing=true break fi done if ! $missing; then break fi sleep ${#@} done } # # Check if the given device is physical device # function is_physical_device #device { typeset device=${1#$DEV_DSKDIR/} device=${device#$DEV_RDSKDIR/} if is_linux; then is_disk_device "$DEV_DSKDIR/$device" && \ [[ -f /sys/module/loop/parameters/max_part ]] return $? elif is_freebsd; then is_disk_device "$DEV_DSKDIR/$device" && \ echo $device | egrep -q \ -e '^a?da[0-9]+$' \ -e '^md[0-9]+$' \ -e '^mfid[0-9]+$' \ -e '^nda[0-9]+$' \ -e '^nvd[0-9]+$' \ -e '^vtbd[0-9]+$' return $? else echo $device | egrep "^c[0-F]+([td][0-F]+)+$" > /dev/null 2>&1 return $? fi } # # Check if the given device is a real device (ie SCSI device) # function is_real_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ egrep disk >/dev/null return $? fi } # # Check if the given device is a loop device # function is_loop_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ egrep loop >/dev/null return $? fi } # # Linux: # Check if the given device is a multipath device and if there is a symbolic # link to a device mapper and to a disk # Currently no support for dm devices alone without multipath # # FreeBSD: # Check if the given device is a gmultipath device. # # Others: # No multipath detection. # function is_mpath_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_MPATHDIR/$disk -o TYPE 2>/dev/null | \ egrep mpath >/dev/null if (($? == 0)); then readlink $DEV_MPATHDIR/$disk > /dev/null 2>&1 return $? else return $? fi elif is_freebsd; then is_disk_device $DEV_MPATHDIR/$disk else false fi } # # Check if the given path is the appropriate sort of device special node. # function is_disk_device #path { typeset path=$1 if is_freebsd; then # FreeBSD doesn't have block devices, only character devices. test -c $path else test -b $path fi } # Set the slice prefix for disk partitioning depending # on whether the device is a real, multipath, or loop device. # Currently all disks have to be of the same type, so only # checks first disk to determine slice prefix. # function set_slice_prefix { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | nawk '{print $(i + 1)}')" if ( is_mpath_device $disk ) && [[ -z $(echo $disk | awk 'substr($1,18,1)\ ~ /^[[:digit:]]+$/') ]] || ( is_real_device $disk ); then export SLICE_PREFIX="" return 0 elif ( is_mpath_device $disk || is_loop_device \ $disk ); then export SLICE_PREFIX="p" return 0 else log_fail "$disk not supported for partitioning." fi (( i = i + 1)) done fi } # # Set the directory path of the listed devices in $DISK_ARRAY_NUM # Currently all disks have to be of the same type, so only # checks first disk to determine device directory # default = /dev (linux) # real disk = /dev (linux) # multipath device = /dev/mapper (linux) # function set_device_dir { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | nawk '{print $(i + 1)}')" if is_mpath_device $disk; then export DEV_DSKDIR=$DEV_MPATHDIR return 0 else export DEV_DSKDIR=$DEV_RDSKDIR return 0 fi (( i = i + 1)) done else export DEV_DSKDIR=$DEV_RDSKDIR fi } # # Get the directory path of given device # function get_device_dir #device { typeset device=$1 if ! is_freebsd && ! is_physical_device $device; then if [[ $device != "/" ]]; then device=${device%/*} fi if is_disk_device "$DEV_DSKDIR/$device"; then device="$DEV_DSKDIR" fi echo $device else echo "$DEV_DSKDIR" fi } # # Get persistent name for given disk # function get_persistent_disk_name #device { typeset device=$1 typeset dev_id if is_linux; then if is_real_device $device; then dev_id="$(udevadm info -q all -n $DEV_DSKDIR/$device \ | egrep disk/by-id | nawk '{print $2; exit}' \ | nawk -F / '{print $3}')" echo $dev_id elif is_mpath_device $device; then dev_id="$(udevadm info -q all -n $DEV_DSKDIR/$device \ | egrep disk/by-id/dm-uuid \ | nawk '{print $2; exit}' \ | nawk -F / '{print $3}')" echo $dev_id else echo $device fi else echo $device fi } # # Online or offline a disk on the system # # First checks state of disk. Test will fail if disk is not properly onlined # or offlined. Online is a full rescan of SCSI disks by echoing to every # host entry. # function on_off_disk # disk state{online,offline} host { typeset disk=$1 typeset state=$2 typeset host=$3 [[ -z $disk ]] || [[ -z $state ]] && \ log_fail "Arguments invalid or missing" if is_linux; then if [[ $state == "offline" ]] && ( is_mpath_device $disk ); then dm_name="$(readlink $DEV_DSKDIR/$disk \ | nawk -F / '{print $2}')" dep="$(ls /sys/block/${dm_name}/slaves \ | nawk '{print $1}')" while [[ -n $dep ]]; do #check if disk is online lsscsi | egrep $dep > /dev/null if (($? == 0)); then dep_dir="/sys/block/${dm_name}" dep_dir+="/slaves/${dep}/device" ss="${dep_dir}/state" sd="${dep_dir}/delete" log_must eval "echo 'offline' > ${ss}" log_must eval "echo '1' > ${sd}" lsscsi | egrep $dep > /dev/null if (($? == 0)); then log_fail "Offlining" \ "$disk failed" fi fi dep="$(ls /sys/block/$dm_name/slaves \ 2>/dev/null | nawk '{print $1}')" done elif [[ $state == "offline" ]] && ( is_real_device $disk ); then #check if disk is online lsscsi | egrep $disk > /dev/null if (($? == 0)); then dev_state="/sys/block/$disk/device/state" dev_delete="/sys/block/$disk/device/delete" log_must eval "echo 'offline' > ${dev_state}" log_must eval "echo '1' > ${dev_delete}" lsscsi | egrep $disk > /dev/null if (($? == 0)); then log_fail "Offlining $disk" \ "failed" fi else log_note "$disk is already offline" fi elif [[ $state == "online" ]]; then #force a full rescan scan_scsi_hosts $host block_device_wait if is_mpath_device $disk; then dm_name="$(readlink $DEV_DSKDIR/$disk \ | nawk -F / '{print $2}')" dep="$(ls /sys/block/$dm_name/slaves \ | nawk '{print $1}')" lsscsi | egrep $dep > /dev/null if (($? != 0)); then log_fail "Onlining $disk failed" fi elif is_real_device $disk; then block_device_wait typeset -i retries=0 while ! lsscsi | egrep -q $disk; do if (( $retries > 2 )); then log_fail "Onlining $disk failed" break fi (( ++retries )) sleep 1 done else log_fail "$disk is not a real dev" fi else log_fail "$disk failed to $state" fi fi } # # Simulate disk removal # function remove_disk #disk { typeset disk=$1 on_off_disk $disk "offline" block_device_wait } # # Simulate disk insertion for the given SCSI host # function insert_disk #disk scsi_host { typeset disk=$1 typeset scsi_host=$2 on_off_disk $disk "online" $scsi_host block_device_wait } # # Load scsi_debug module with specified parameters # $blksz can be either one of: < 512b | 512e | 4Kn > # function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz { typeset devsize=$1 typeset hosts=$2 typeset tgts=$3 typeset luns=$4 typeset blksz=$5 [[ -z $devsize ]] || [[ -z $hosts ]] || [[ -z $tgts ]] || \ [[ -z $luns ]] || [[ -z $blksz ]] && \ log_fail "Arguments invalid or missing" case "$5" in '512b') typeset sector=512 typeset blkexp=0 ;; '512e') typeset sector=512 typeset blkexp=3 ;; '4Kn') typeset sector=4096 typeset blkexp=0 ;; *) log_fail "Unsupported blksz value: $5" ;; esac if is_linux; then modprobe -n scsi_debug if (($? != 0)); then log_unsupported "Platform does not have scsi_debug" "module" fi lsmod | egrep scsi_debug > /dev/null if (($? == 0)); then log_fail "scsi_debug module already installed" else log_must modprobe scsi_debug dev_size_mb=$devsize \ add_host=$hosts num_tgts=$tgts max_luns=$luns \ sector_size=$sector physblk_exp=$blkexp block_device_wait lsscsi | egrep scsi_debug > /dev/null if (($? == 1)); then log_fail "scsi_debug module install failed" fi fi fi } # # Unload scsi_debug module, if needed. # function unload_scsi_debug { log_must_retry "in use" 5 modprobe -r scsi_debug } # # Get scsi_debug device name. # Returns basename of scsi_debug device (for example "sdb"). # function get_debug_device { for i in {1..10} ; do val=$(lsscsi | nawk '/scsi_debug/ {print $6; exit}' | cut -d / -f3) # lsscsi can take time to settle if [ "$val" != "-" ] ; then break fi sleep 1 done echo "$val" } # # Get actual devices used by the pool (i.e. linux sdb1 not sdb). # function get_pool_devices #testpool #devdir { typeset testpool=$1 typeset devdir=$2 typeset out="" if is_linux || is_freebsd; then out=$(zpool status -P $testpool |grep ${devdir} | awk '{print $1}') out=$(echo $out | sed -e "s|${devdir}/||g" | tr '\n' ' ') fi echo $out } # # Write to standard out giving the level, device name, offset and length # of all blocks in an input file. The offset and length are in units of # 512 byte blocks. In the case of mirrored vdevs, only the first # device is listed, as the levels, blocks and offsets will be the same # on other devices. Note that this function only works with mirrored # or non-redundant pools, not raidz. # # The output of this function can be used to introduce corruption at # varying levels of indirection. # function list_file_blocks # input_file { typeset input_file=$1 [[ -f $input_file ]] || log_fail "Couldn't find $input_file" typeset ds="$(zfs list -H -o name $input_file)" typeset pool="${ds%%/*}" typeset objnum="$(get_objnum $input_file)" # # Establish a mapping between vdev ids as shown in a DVA and the # pathnames they correspond to in ${VDEV_MAP[][]}. # # The vdev bits in a DVA refer to the top level vdev id. # ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev. # eval $(zdb -C $pool | awk ' BEGIN { printf "typeset -a VDEV_MAP;" } function subscript(s) { # "[#]" is more convenient than the bare "#" match(s, /\[[0-9]*\]/) return substr(s, RSTART, RLENGTH) } id && !/^ / { # left a top level vdev id = 0 } id && $1 ~ /^path:$/ { # found a vdev path; save it in the map printf "VDEV_MAP%s%s=%s;", id, child, $2 } /^ children/ { # entering a top level vdev id = subscript($0) child = "[0]" # default in case there is no nested vdev printf "typeset -a VDEV_MAP%s;", id } /^ children/ { # entering a nested vdev (e.g. child of a top level mirror) child = subscript($0) } ') # # The awk below parses the output of zdb, printing out the level # of each block along with vdev id, offset and length. The last # two are converted to decimal in the while loop. 4M is added to # the offset to compensate for the first two labels and boot # block. Lastly, the offset and length are printed in units of # 512b blocks for ease of use with dd. # log_must zpool sync -f typeset level path offset length zdb -ddddd $ds $objnum | awk -F: ' /^Indirect blocks:/ { looking = 1 } /^\t\tsegment / { looking = 0 } /L[0-8]/ && looking { print } ' | sed -n 's/^.*\(L[0-9]\) *\([0-9]*\):\([0-9a-f]*\):\([0-9a-f]*\) .*$/\1 \2 \3 \4/p' | \ while read level vdev offset length; do offset=$((16#$offset)) # Conversion from hex length=$((16#$length)) offset="$(((offset + 4 * 1024 * 1024) / 512))" length="$((length / 512))" for path in ${VDEV_MAP[$vdev][@]}; do echo "$level $path $offset $length" done done 2>/dev/null } function corrupt_blocks_at_level # input_file corrupt_level { typeset input_file=$1 typeset corrupt_level="L${2:-0}" typeset level path offset length [[ -f $input_file ]] || log_fail "Couldn't find $input_file" if is_freebsd; then # Temporarily allow corrupting an inuse device. debugflags=$(sysctl -n kern.geom.debugflags) sysctl kern.geom.debugflags=16 fi list_file_blocks $input_file | \ while read level path offset length; do if [[ $level = $corrupt_level ]]; then log_must dd if=/dev/urandom of=$path bs=512 \ count=$length seek=$offset conv=notrunc fi done if is_freebsd; then sysctl kern.geom.debugflags=$debugflags fi # This is necessary for pools made of loop devices. sync }