#!/bin/sh # # Show SMART stats # helpstr=" smart: Show SMART temperature and error stats (specific to drive type) smartx: Show SMART extended drive stats (specific to drive type). temp: Show SMART drive temperature in celsius (all drives). health: Show reported SMART status (all drives). r_proc: Show SMART read GBytes processed over drive lifetime (SAS). w_proc: Show SMART write GBytes processed over drive lifetime (SAS). r_ucor: Show SMART read uncorrectable errors (SAS). w_ucor: Show SMART write uncorrectable errors (SAS). nonmed: Show SMART non-medium errors (SAS). defect: Show SMART grown defect list (SAS). hours_on: Show number of hours drive powered on (all drives). realloc: Show SMART reallocated sectors count (ATA). rep_ucor: Show SMART reported uncorrectable count (ATA). cmd_to: Show SMART command timeout count (ATA). pend_sec: Show SMART current pending sector count (ATA). off_ucor: Show SMART offline uncorrectable errors (ATA). ata_err: Show SMART ATA errors (ATA). pwr_cyc: Show SMART power cycle count (ATA). serial: Show disk serial number. nvme_err: Show SMART NVMe errors (NVMe). smart_test: Show SMART self-test results summary. test_type: Show SMART self-test type (short, long... ). test_status: Show SMART self-test status. test_progress: Show SMART self-test percentage done. test_ended: Show when the last SMART self-test ended (if supported). " # Hack for developer testing # # If you set $samples to a directory containing smartctl output text files, # we will use them instead of running smartctl on the vdevs. This can be # useful if you want to test a bunch of different smartctl outputs. Also, if # $samples is set, and additional 'file' column is added to the zpool output # showing the filename. samples= # get_filename_from_dir DIR # # Look in directory DIR and return a filename from it. The filename returned # is chosen quasi-sequentially (based off our PID). This allows us to return # a different filename every time this script is invoked (which we do for each # vdev), without having to maintain state. get_filename_from_dir() { dir=$1 pid="$$" num_files=$(find "$dir" -maxdepth 1 -type f | wc -l) mod=$((pid % num_files)) i=0 find "$dir" -type f -printf "%f\n" | while read -r file ; do if [ "$mod" = "$i" ] ; then echo "$file" break fi i=$((i+1)) done } script=$(basename "$0") if [ "$1" = "-h" ] ; then echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- exit fi smartctl_path=$(command -v smartctl) if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then if [ -n "$samples" ] ; then # cat a smartctl output text file instead of running smartctl # on a vdev (only used for developer testing). file=$(get_filename_from_dir "$samples") echo "file=$file" raw_out=$(cat "$samples/$file") else raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH") fi # What kind of drive are we? Look for the right line in smartctl: # # SAS: # Transport protocol: SAS # # SATA: # ATA Version is: 8 # # NVMe: # SMART/Health Information (NVMe Log 0xnn, NSID 0xnn) # out=$(echo "$raw_out" | awk ' # SAS specific /read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8} /write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8} /Non-medium error count/{print "nonmed="$4} /Elements in grown defect list/{print "defect="$6} # SAS common /SAS/{type="sas"} /Drive Temperature:/{print "temp="$4} # Status can be a long string, substitute spaces for '_' /SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i} /number of hours powered up/{print "hours_on="$7; hours_on=int($7)} /Serial number:/{print "serial="$3} # SATA specific /Reallocated_Sector_Ct/{print "realloc="$10} /Reported_Uncorrect/{print "rep_ucor="$10} /Command_Timeout/{print "cmd_to="$10} /Current_Pending_Sector/{print "pend_sec="$10} /Offline_Uncorrectable/{print "off_ucor="$10} /ATA Error Count:/{print "ata_err="$4} /Power_Cycle_Count/{print "pwr_cyc="$10} # SATA common /SATA/{type="sata"} /Temperature_Celsius/{print "temp="$10} /Airflow_Temperature_Cel/{print "temp="$10} /Current Temperature:/{print "temp="$3} /SMART overall-health self-assessment test result:/{print "health="$6} /Power_On_Hours/{print "hours_on="$10; hours_on=int($10)} /Serial Number:/{print "serial="$3} # NVMe common /NVMe/{type="nvme"} /Temperature:/{print "temp="$2} /SMART overall-health self-assessment test result:/{print "health="$6} /Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4} /Serial Number:/{print "serial="$3} /Power Cycles:/{print "pwr_cyc="$3} # NVMe specific /Media and Data Integrity Errors:/{print "nvme_err="$6} # SMART self-test info /Self-test execution status:/{progress=tolower($4)} # SAS /SMART Self-test log/{test_seen=1} # SAS /SMART Extended Self-test Log/{test_seen=1} # SATA /# 1/{ test_type=tolower($3"_"$4); # Status could be one word ("Completed") or multiple ("Completed: read # failure"). Look for the ":" to see if we need to grab more words. if ($5 ~ ":") status=tolower($5""$6"_"$7) else status=tolower($5) if (status=="self") status="running"; if (type == "sas") { hours=int($(NF-4)) } else { hours=int($(NF-1)) # SATA reports percent remaining, rather than percent done # Convert it to percent done. progress=(100-int($(NF-2)))"%" } # When we int()-ify "hours", it converts stuff like "NOW" and "-" into # 0. In those cases, set it to hours_on, so they will cancel out in # the "hours_ago" calculation later on. if (hours == 0) hours=hours_on if (test_seen) { print "test="hours_on print "test_type="test_type print "test_status="status print "test_progress="progress } # Not all drives report hours_on if (hours_on && hours) { total_hours_ago=(hours_on-hours) days_ago=int(total_hours_ago/24) hours_ago=(total_hours_ago % 24) if (days_ago != 0) ago_str=days_ago"d" if (hours_ago !=0) ago_str=ago_str""hours_ago"h" print "test_ended="ago_str } } END {print "type="type; ORS="\n"; print ""} '); fi type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2) # If type is not set by now, either we don't have a block device # or smartctl failed. Either way, default to ATA and set $out to # nothing. if [ -z "$type" ]; then type="sata" out= fi case $script in smart) # Print temperature plus common predictors of drive failure if [ "$type" = "sas" ] ; then scripts="temp|health|r_ucor|w_ucor" elif [ "$type" = "sata" ] ; then scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor" elif [ "$type" = "nvme" ] ; then scripts="temp|health|nvme_err" fi ;; smartx) # Print some other interesting stats if [ "$type" = "sas" ] ; then scripts="hours_on|defect|nonmed|r_proc|w_proc" elif [ "$type" = "sata" ] ; then scripts="hours_on|pwr_cyc" elif [ "$type" = "nvme" ] ; then scripts="hours_on|pwr_cyc" fi ;; smart_test) scripts="test_type|test_status|test_progress|test_ended" ;; *) scripts="$script" esac with_vals=$(echo "$out" | grep -E "$scripts") if [ -n "$with_vals" ]; then echo "$with_vals" without_vals=$(echo "$scripts" | tr "|" "\n" | grep -v -E "$(echo "$with_vals" | awk -F "=" '{print $1}')" | awk '{print $0"="}') else without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}') fi if [ -n "$without_vals" ]; then echo "$without_vals" fi