diff --git a/configure.ac b/configure.ac index 15db501914..3dd1cb0812 100644 --- a/configure.ac +++ b/configure.ac @@ -130,6 +130,7 @@ AC_CONFIG_FILES([ Makefile zfs/lib/libspl/include/sys/Makefile zfs/lib/libspl/include/Makefile zfs/lib/libspl/Makefile + zfs/lib/libzpios/Makefile zfs/zcmd/ztest/Makefile zfs/zcmd/Makefile zfs/zcmd/zfs/Makefile @@ -137,5 +138,6 @@ AC_CONFIG_FILES([ Makefile zfs/zcmd/zinject/Makefile zfs/zcmd/zdump/Makefile zfs/zcmd/zpool/Makefile + zfs/zcmd/zpios/Makefile ]) AC_OUTPUT diff --git a/scripts/profile-zpios-disk.sh b/scripts/profile-zpios-disk.sh new file mode 100644 index 0000000000..38607a5492 --- /dev/null +++ b/scripts/profile-zpios-disk.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# profile-zpios-disk.sh +# +# /proc/diskinfo +# Field 1 -- device name +# Field 2 -- # of reads issued +# Field 3 -- # of reads merged +# Field 4 -- # of sectors read +# Field 5 -- # of milliseconds spent reading +# Field 6 -- # of writes completed +# Field 7 -- # of writes merged +# Field 8 -- # of sectors written +# Field 9 -- # of milliseconds spent writing +# Field 10 -- # of I/Os currently in progress +# Field 11 -- # of milliseconds spent doing I/Os +# Field 12 -- weighted # of milliseconds spent doing I/Os + +RUN_PIDS=${0} +RUN_LOG_DIR=${1} +RUN_ID=${2} + +create_table() { + local FIELD=$1 + local ROW_M=() + local ROW_N=() + local HEADER=1 + local STEP=1 + + for DISK_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/disk-[0-9]*`; do + ROW_M=( ${ROW_N[@]} ) + ROW_N=( `cat ${DISK_FILE} | grep sd | cut -c11- | cut -f${FIELD} -d' ' | tr "\n" "\t"` ) + + if [ $HEADER -eq 1 ]; then + echo -n "step, " + cat ${DISK_FILE} | grep sd | cut -c11- | cut -f1 -d' ' | tr "\n" ", " + echo "total" + HEADER=0 + fi + + if [ ${#ROW_M[@]} -eq 0 ]; then + continue + fi + + if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then + echo "Badly formatted profile data in ${DISK_FILE}" + break + fi + + TOTAL=0 + echo -n "${STEP}, " + for (( i=0; i<${#ROW_N[@]}; i++ )); do + DELTA=`echo "${ROW_N[${i}]}-${ROW_M[${i}]}" | bc` + let TOTAL=${TOTAL}+${DELTA} + echo -n "${DELTA}, " + done + echo "${TOTAL}, " + + let STEP=${STEP}+1 + done +} + +create_table_mbs() { + local FIELD=$1 + local TIME=$2 + local ROW_M=() + local ROW_N=() + local HEADER=1 + local STEP=1 + + for DISK_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/disk-[0-9]*`; do + ROW_M=( ${ROW_N[@]} ) + ROW_N=( `cat ${DISK_FILE} | grep sd | cut -c11- | cut -f${FIELD} -d' ' | tr "\n" "\t"` ) + + if [ $HEADER -eq 1 ]; then + echo -n "step, " + cat ${DISK_FILE} | grep sd | cut -c11- | cut -f1 -d' ' | tr "\n" ", " + echo "total" + HEADER=0 + fi + + if [ ${#ROW_M[@]} -eq 0 ]; then + continue + fi + + if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then + echo "Badly formatted profile data in ${DISK_FILE}" + break + fi + + TOTAL=0 + echo -n "${STEP}, " + for (( i=0; i<${#ROW_N[@]}; i++ )); do + DELTA=`echo "${ROW_N[${i}]}-${ROW_M[${i}]}" | bc` + MBS=`echo "scale=2; ((${DELTA}*512)/${TIME})/(1024*1024)" | bc` + TOTAL=`echo "scale=2; ${TOTAL}+${MBS}" | bc` + echo -n "${MBS}, " + done + echo "${TOTAL}, " + + let STEP=${STEP}+1 + done +} + +echo +echo "Reads issued per device" +create_table 2 +echo +echo "Reads merged per device" +create_table 3 +echo +echo "Sectors read per device" +create_table 4 +echo "MB/s per device" +create_table_mbs 4 3 + +echo +echo "Writes issued per device" +create_table 6 +echo +echo "Writes merged per device" +create_table 7 +echo +echo "Sectors written per device" +create_table 8 +echo "MB/s per device" +create_table_mbs 8 3 + +exit 0 diff --git a/scripts/profile-zpios-pids.sh b/scripts/profile-zpios-pids.sh new file mode 100644 index 0000000000..4d8adb1a15 --- /dev/null +++ b/scripts/profile-zpios-pids.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# profile-zpios-pids.sh + +RUN_PIDS=${0} +RUN_LOG_DIR=${1} +RUN_ID=${2} + +ROW_M=() +ROW_N=() +ROW_N_SCHED=() +ROW_N_WAIT=() + +HEADER=1 +STEP=1 + +for PID_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/pids-[0-9]*`; do + ROW_M=( ${ROW_N[@]} ) + ROW_N=( 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ) + ROW_N_SCHED=( `cat ${PID_FILE} | cut -f15 -d' ' | tr "\n" "\t"` ) + ROW_N_WAIT=( `cat ${PID_FILE} | cut -f17 -d' ' | tr "\n" "\t"` ) + ROW_N_NAMES=( `cat ${PID_FILE} | cut -f2 -d' ' | cut -f2 -d'(' | + cut -f1 -d')' | cut -f1 -d'/' | tr "\n" "\t"` ) + + for (( i=0; i<${#ROW_N_SCHED[@]}; i++ )); do + SUM=`echo "${ROW_N_WAIT[${i}]}+${ROW_N_SCHED[${i}]}" | bc` + + case ${ROW_N_NAMES[${i}]} in + zio_taskq) IDX=0;; + zio_req_nul) IDX=1;; + zio_irq_nul) IDX=2;; + zio_req_rd) IDX=3;; + zio_irq_rd) IDX=4;; + zio_req_wr) IDX=5;; + zio_irq_wr) IDX=6;; + zio_req_fr) IDX=7;; + zio_irq_fr) IDX=8;; + zio_req_cm) IDX=9;; + zio_irq_cm) IDX=10;; + zio_req_ctl) IDX=11;; + zio_irq_ctl) IDX=12;; + txg_quiesce) IDX=13;; + txg_sync) IDX=14;; + txg_timelimit) IDX=15;; + arc_reclaim) IDX=16;; + l2arc_feed) IDX=17;; + kpios_io) IDX=18;; + *) continue;; + esac + + let ROW_N[${IDX}]=${ROW_N[${IDX}]}+${SUM} + done + + if [ $HEADER -eq 1 ]; then + echo "step, zio_taskq, zio_req_nul, zio_irq_nul, " \ + "zio_req_rd, zio_irq_rd, zio_req_wr, zio_irq_wr, " \ + "zio_req_fr, zio_irq_fr, zio_req_cm, zio_irq_cm, " \ + "zio_req_ctl, zio_irq_ctl, txg_quiesce, txg_sync, " \ + "txg_timelimit, arc_reclaim, l2arc_feed, kpios_io, " \ + "idle" + HEADER=0 + fi + + if [ ${#ROW_M[@]} -eq 0 ]; then + continue + fi + + if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then + echo "Badly formatted profile data in ${PID_FILE}" + break + fi + + # Original values are in jiffies and we expect HZ to be 1000 + # on most 2.6 systems thus we divide by 10 to get a percentage. + IDLE=1000 + echo -n "${STEP}, " + for (( i=0; i<${#ROW_N[@]}; i++ )); do + DELTA=`echo "${ROW_N[${i}]}-${ROW_M[${i}]}" | bc` + DELTA_PERCENT=`echo "scale=1; ${DELTA}/10" | bc` + let IDLE=${IDLE}-${DELTA} + echo -n "${DELTA_PERCENT}, " + done + ILDE_PERCENT=`echo "scale=1; ${IDLE}/10" | bc` + echo "${ILDE_PERCENT}" + + let STEP=${STEP}+1 +done + +exit + +echo +echo "Percent of total system time per pid" +for PID_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/pids-[0-9]*`; do + ROW_M=( ${ROW_N[@]} ) + ROW_N_SCHED=( `cat ${PID_FILE} | cut -f15 -d' ' | tr "\n" "\t"` ) + ROW_N_WAIT=( `cat ${PID_FILE} | cut -f17 -d' ' | tr "\n" "\t"` ) + + for (( i=0; i<${#ROW_N_SCHED[@]}; i++ )); do + ROW_N[${i}]=`echo "${ROW_N_WAIT[${i}]}+${ROW_N_SCHED[${i}]}" | bc` + done + + if [ $HEADER -eq 1 ]; then + echo -n "step, " + cat ${PID_FILE} | cut -f2 -d' ' | tr "\n" ", " + echo + HEADER=0 + fi + + if [ ${#ROW_M[@]} -eq 0 ]; then + continue + fi + + if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then + echo "Badly formatted profile data in ${PID_FILE}" + break + fi + + # Original values are in jiffies and we expect HZ to be 1000 + # on most 2.6 systems thus we divide by 10 to get a percentage. + echo -n "${STEP}, " + for (( i=0; i<${#ROW_N[@]}; i++ )); do + DELTA=`echo "scale=1; (${ROW_N[${i}]}-${ROW_M[${i}]})/10" | bc` + echo -n "${DELTA}, " + done + + echo + let STEP=${STEP}+1 +done + + +exit 0 diff --git a/scripts/profile-zpios-post.sh b/scripts/profile-zpios-post.sh new file mode 100644 index 0000000000..16bc590f80 --- /dev/null +++ b/scripts/profile-zpios-post.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +prog=profile-zpios-post.sh +. ../.script-config + +RUN_POST=${0} +RUN_PHASE=${1} +RUN_LOG_DIR=${2} +RUN_ID=${3} +RUN_POOL=${4} +RUN_CHUNK_SIZE=${5} +RUN_REGION_SIZE=${6} +RUN_THREAD_COUNT=${7} +RUN_REGION_COUNT=${8} +RUN_OFFSET=${9} +RUN_REGION_NOISE=${10} +RUN_CHUNK_NOISE=${11} +RUN_THREAD_DELAY=${12} +RUN_FLAGS=${13} +RUN_RESULT=${14} + +PROFILE_ZPIOS_PIDS_BIN=/home/behlendo/src/zfs/scripts/profile-zpios-pids.sh +PROFILE_ZPIOS_PIDS_LOG=${RUN_LOG_DIR}/${RUN_ID}/pids-summary.csv + +PROFILE_ZPIOS_DISK_BIN=/home/behlendo/src/zfs/scripts/profile-zpios-disk.sh +PROFILE_ZPIOS_DISK_LOG=${RUN_LOG_DIR}/${RUN_ID}/disk-summary.csv + +PROFILE_ZPIOS_ARC_LOG=${RUN_LOG_DIR}/${RUN_ID}/arcstats +PROFILE_ZPIOS_VDEV_LOG=${RUN_LOG_DIR}/${RUN_ID}/vdev_cache_stats + +KERNEL_BIN="/lib/modules/`uname -r`/kernel/" +SPL_BIN="${SPLBUILD}/modules/spl/" +ZFS_BIN="${ZFSBUILD}/lib/" + +OPROFILE_SHORT_ARGS="-a -g -l -p ${KERNEL_BIN},${SPL_BIN},${ZFS_BIN}" +OPROFILE_LONG_ARGS="-d -a -g -l -p ${KERNEL_BIN},${SPL_BIN},${ZFS_BIN}" + +OPROFILE_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile.txt +OPROFILE_SHORT_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile-short.txt +OPROFILE_LONG_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile-long.txt +PROFILE_PID=${RUN_LOG_DIR}/${RUN_ID}/pid + +if [ "${RUN_PHASE}" != "post" ]; then + exit 1 +fi + +# opcontrol --stop >>${OPROFILE_LOG} 2>&1 +# opcontrol --dump >>${OPROFILE_LOG} 2>&1 + +kill -s SIGHUP `cat ${PROFILE_PID}` +rm -f ${PROFILE_PID} + +# opreport ${OPROFILE_SHORT_ARGS} >${OPROFILE_SHORT_LOG} 2>&1 +# opreport ${OPROFILE_LONG_ARGS} >${OPROFILE_LONG_LOG} 2>&1 + +# opcontrol --deinit >>${OPROFILE_LOG} 2>&1 + +cat /proc/spl/kstat/zfs/arcstats >${PROFILE_ZPIOS_ARC_LOG} +cat /proc/spl/kstat/zfs/vdev_cache_stats >${PROFILE_ZPIOS_VDEV_LOG} + +# Summarize system time per pid +${PROFILE_ZPIOS_PIDS_BIN} ${RUN_LOG_DIR} ${RUN_ID} >${PROFILE_ZPIOS_PIDS_LOG} + +# Summarize per device performance +${PROFILE_ZPIOS_DISK_BIN} ${RUN_LOG_DIR} ${RUN_ID} >${PROFILE_ZPIOS_DISK_LOG} + +exit 0 diff --git a/scripts/profile-zpios-pre.sh b/scripts/profile-zpios-pre.sh new file mode 100644 index 0000000000..675b80240a --- /dev/null +++ b/scripts/profile-zpios-pre.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# profile-zpios-pre.sh + +trap "PROFILE_ZPIOS_READY=1" SIGHUP + +RUN_PRE=${0} +RUN_PHASE=${1} +RUN_LOG_DIR=${2} +RUN_ID=${3} +RUN_POOL=${4} +RUN_CHUNK_SIZE=${5} +RUN_REGION_SIZE=${6} +RUN_THREAD_COUNT=${7} +RUN_REGION_COUNT=${8} +RUN_OFFSET=${9} +RUN_REGION_NOISE=${10} +RUN_CHUNK_NOISE=${11} +RUN_THREAD_DELAY=${12} +RUN_FLAGS=${13} +RUN_RESULT=${14} + +PROFILE_ZPIOS_BIN=/home/behlendo/src/zfs/scripts/profile-zpios.sh +PROFILE_ZPIOS_READY=0 + +OPROFILE_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile.txt +PROFILE_PID=${RUN_LOG_DIR}/${RUN_ID}/pid +RUN_ARGS=${RUN_LOG_DIR}/${RUN_ID}/args + +if [ "${RUN_PHASE}" != "pre" ]; then + exit 1 +fi + +rm -Rf ${RUN_LOG_DIR}/${RUN_ID}/ +mkdir -p ${RUN_LOG_DIR}/${RUN_ID}/ + +echo "PHASE=${RUN_PHASE}" >>${RUN_ARGS} +echo "LOG_DIR=${RUN_LOG_DIR}" >>${RUN_ARGS} +echo "ID=${RUN_ID}" >>${RUN_ARGS} +echo "POOL=${RUN_POOL}" >>${RUN_ARGS} +echo "CHUNK_SIZE=${RUN_CHUNK_SIZE}" >>${RUN_ARGS} +echo "REGION_SIZE=${RUN_REGION_SIZE}" >>${RUN_ARGS} +echo "THREAD_COUNT=${RUN_THREAD_COUNT}" >>${RUN_ARGS} +echo "REGION_COUNT=${RUN_REGION_COUNT}" >>${RUN_ARGS} +echo "OFFSET=${RUN_OFFSET}" >>${RUN_ARGS} +echo "REGION_NOISE=${RUN_REGION_NOISE}" >>${RUN_ARGS} +echo "CHUNK_NOISE=${RUN_CHUNK_NOISE}" >>${RUN_ARGS} +echo "THREAD_DELAY=${RUN_THREAD_DELAY}" >>${RUN_ARGS} +echo "FLAGS=${RUN_FLAGS}" >>${RUN_ARGS} +echo "RESULT=${RUN_RESULT}" >>${RUN_ARGS} + +# XXX: Oprofile support seems to be broken when I try and start +# it via a user mode helper script, I suspect the setup is failing. +# opcontrol --init >>${OPROFILE_LOG} 2>&1 +# opcontrol --setup --vmlinux=/boot/vmlinux >>${OPROFILE_LOG} 2>&1 + +# Start the profile script +${PROFILE_ZPIOS_BIN} ${RUN_PHASE} ${RUN_LOG_DIR} ${RUN_ID} & +echo "$!" >${PROFILE_PID} + +# Sleep waiting for profile script to be ready, it will +# signal us via SIGHUP when it is ready to start profiling. +while [ ${PROFILE_ZPIOS_READY} -eq 0 ]; do + sleep 0.1 +done + +# opcontrol --start-daemon >>${OPROFILE_LOG} 2>&1 +# opcontrol --start >>${OPROFILE_LOG} 2>&1 + +exit 0 diff --git a/scripts/profile-zpios.sh b/scripts/profile-zpios.sh new file mode 100644 index 0000000000..2ea110c536 --- /dev/null +++ b/scripts/profile-zpios.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# profile-zpios.sh + +trap "RUN_DONE=1" SIGHUP + +RUN_PHASE=${1} +RUN_LOG_DIR=${2} +RUN_ID=${3} +RUN_DONE=0 + +POLL_INTERVAL=2.99 + +# Log these pids, the exact pid numbers will vary from system to system +# so I harvest pid for all the following type of processes from /proc// +# +# zio_taskq/# +# spa_zio_issue/# +# spa_zio_intr/# +# txg_quiesce_thr +# txg_sync_thread +# txg_timelimit_t +# arc_reclaim_thr +# l2arc_feed_thre +# kpios_io/# + +ZIO_TASKQ_PIDS=() +ZIO_REQ_NUL_PIDS=() +ZIO_IRQ_NUL_PIDS=() +ZIO_REQ_RD_PIDS=() +ZIO_IRQ_RD_PIDS=() +ZIO_REQ_WR_PIDS=() +ZIO_IRQ_WR_PIDS=() +ZIO_REQ_FR_PIDS=() +ZIO_IRQ_FR_PIDS=() +ZIO_REQ_CM_PIDS=() +ZIO_IRQ_CM_PIDS=() +ZIO_REQ_CTL_PIDS=() +ZIO_IRQ_CTL_PIDS=() + +TXG_QUIESCE_PIDS=() +TXG_SYNC_PIDS=() +TXG_TIMELIMIT_PIDS=() + +ARC_RECLAIM_PIDS=() +L2ARC_FEED_PIDS=() + +KPIOS_IO_PIDS=() + +show_pids() { + echo "* zio_taskq: { ${ZIO_TASKQ_PIDS[@]} } = ${#ZIO_TASKQ_PIDS[@]}" + echo "* zio_req_nul: { ${ZIO_REQ_NUL_PIDS[@]} } = ${#ZIO_REQ_NUL_PIDS[@]}" + echo "* zio_irq_nul: { ${ZIO_IRQ_NUL_PIDS[@]} } = ${#ZIO_IRQ_NUL_PIDS[@]}" + echo "* zio_req_rd: { ${ZIO_REQ_RD_PIDS[@]} } = ${#ZIO_REQ_RD_PIDS[@]}" + echo "* zio_irq_rd: { ${ZIO_IRQ_RD_PIDS[@]} } = ${#ZIO_IRQ_RD_PIDS[@]}" + echo "* zio_req_wr: { ${ZIO_REQ_WR_PIDS[@]} } = ${#ZIO_REQ_WR_PIDS[@]}" + echo "* zio_irq_wr: { ${ZIO_IRQ_WR_PIDS[@]} } = ${#ZIO_IRQ_WR_PIDS[@]}" + echo "* zio_req_fr: { ${ZIO_REQ_FR_PIDS[@]} } = ${#ZIO_REQ_FR_PIDS[@]}" + echo "* zio_irq_fr: { ${ZIO_IRQ_FR_PIDS[@]} } = ${#ZIO_IRQ_FR_PIDS[@]}" + echo "* zio_req_cm: { ${ZIO_REQ_CM_PIDS[@]} } = ${#ZIO_REQ_CM_PIDS[@]}" + echo "* zio_irq_cm: { ${ZIO_IRQ_CM_PIDS[@]} } = ${#ZIO_IRQ_CM_PIDS[@]}" + echo "* zio_req_ctl: { ${ZIO_REQ_CTL_PIDS[@]} } = ${#ZIO_REQ_CTL_PIDS[@]}" + echo "* zio_irq_ctl: { ${ZIO_IRQ_CTL_PIDS[@]} } = ${#ZIO_IRQ_CTL_PIDS[@]}" + echo "* txg_quiesce: { ${TXG_QUIESCE_PIDS[@]} } = ${#TXG_QUIESCE_PIDS[@]}" + echo "* txg_sync: { ${TXG_SYNC_PIDS[@]} } = ${#TXG_SYNC_PIDS[@]}" + echo "* txg_timelimit: { ${TXG_TIMELIMIT_PIDS[@]} } = ${#TXG_TIMELIMIT_PIDS[@]}" + echo "* arc_reclaim: { ${ARC_RECLAIM_PIDS[@]} } = ${#ARC_RECLAIM_PIDS[@]}" + echo "* l2arc_feed: { ${L2ARC_FEED_PIDS[@]} } = ${#L2ARC_FEED_PIDS[@]}" + echo "* kpios_io: { ${KPIOS_IO_PIDS[@]} } = ${#KPIOS_IO_PIDS[@]}" +} + +check_pid() { + local PID=$1 + local NAME=$2 + local TYPE=$3 + local PIDS=( "$4" ) + local NAME_STRING=`echo ${NAME} | cut -f1 -d'/'` + local NAME_NUMBER=`echo ${NAME} | cut -f2 -d'/'` + + if [ "${NAME_STRING}" == "${TYPE}" ]; then + if [ -n "${NAME_NUMBER}" ]; then + PIDS[${NAME_NUMBER}]=${PID} + else + PIDS[${#PIDS[@]}]=${PID} + + fi + fi + + echo "${PIDS[@]}" +} + +# NOTE: This whole process is crazy slow but it will do for now +aquire_pids() { + echo "--- Aquiring ZFS pids ---" + + for PID in `ls /proc/ | grep [0-9] | sort -n -u`; do + if [ ! -e /proc/${PID}/status ]; then + continue + fi + + NAME=`cat /proc/${PID}/status | head -n1 | cut -f2` + + ZIO_TASKQ_PIDS=( `check_pid ${PID} ${NAME} "zio_taskq" \ + "$(echo "${ZIO_TASKQ_PIDS[@]}")"` ) + + ZIO_REQ_NUL_PIDS=( `check_pid ${PID} ${NAME} "zio_req_nul" \ + "$(echo "${ZIO_REQ_NUL_PIDS[@]}")"` ) + + ZIO_IRQ_NUL_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_nul" \ + "$(echo "${ZIO_IRQ_NUL_PIDS[@]}")"` ) + + ZIO_REQ_RD_PIDS=( `check_pid ${PID} ${NAME} "zio_req_rd" \ + "$(echo "${ZIO_REQ_RD_PIDS[@]}")"` ) + + ZIO_IRQ_RD_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_rd" \ + "$(echo "${ZIO_IRQ_RD_PIDS[@]}")"` ) + + ZIO_REQ_WR_PIDS=( `check_pid ${PID} ${NAME} "zio_req_wr" \ + "$(echo "${ZIO_REQ_WR_PIDS[@]}")"` ) + + ZIO_IRQ_WR_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_wr" \ + "$(echo "${ZIO_IRQ_WR_PIDS[@]}")"` ) + + ZIO_REQ_FR_PIDS=( `check_pid ${PID} ${NAME} "zio_req_fr" \ + "$(echo "${ZIO_REQ_FR_PIDS[@]}")"` ) + + ZIO_IRQ_FR_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_fr" \ + "$(echo "${ZIO_IRQ_FR_PIDS[@]}")"` ) + + ZIO_REQ_CM_PIDS=( `check_pid ${PID} ${NAME} "zio_req_cm" \ + "$(echo "${ZIO_REQ_CM_PIDS[@]}")"` ) + + ZIO_IRQ_CM_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_cm" \ + "$(echo "${ZIO_IRQ_CM_PIDS[@]}")"` ) + + ZIO_REQ_CTL_PIDS=( `check_pid ${PID} ${NAME} "zio_req_ctl" \ + "$(echo "${ZIO_REQ_CTL_PIDS[@]}")"` ) + + ZIO_IRQ_CTL_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_ctl" \ + "$(echo "${ZIO_IRQ_CTL_PIDS[@]}")"` ) + + TXG_QUIESCE_PIDS=( `check_pid ${PID} ${NAME} "txg_quiesce" \ + "$(echo "${TXG_QUIESCE_PIDS[@]}")"` ) + + TXG_SYNC_PIDS=( `check_pid ${PID} ${NAME} "txg_sync" \ + "$(echo "${TXG_SYNC_PIDS[@]}")"` ) + + TXG_TIMELIMIT_PIDS=( `check_pid ${PID} ${NAME} "txg_timelimit" \ + "$(echo "${TXG_TIMELIMIT_PIDS[@]}")"` ) + + ARC_RECLAIM_PIDS=( `check_pid ${PID} ${NAME} "arc_reclaim" \ + "$(echo "${ARC_RECLAIM_PIDS[@]}")"` ) + + L2ARC_FEED_PIDS=( `check_pid ${PID} ${NAME} "l2arc_feed" \ + "$(echo "${L2ARC_FEED_PIDS[@]}")"` ) + done + + # Wait for kpios_io threads to start + kill -s SIGHUP ${PPID} + echo "* Waiting for kpios_io threads to start" + while [ ${RUN_DONE} -eq 0 ]; do + KPIOS_IO_PIDS=( `ps ax | grep kpios_io | grep -v grep | \ + sed 's/^ *//g' | cut -f1 -d' '` ) + if [ ${#KPIOS_IO_PIDS[@]} -gt 0 ]; then + break; + fi + sleep 0.1 + done + + echo "`show_pids`" >${RUN_LOG_DIR}/${RUN_ID}/pids.txt +} + +log_pids() { + echo "--- Logging ZFS profile to ${RUN_LOG_DIR}/${RUN_ID}/ ---" + ALL_PIDS=( ${ZIO_TASKQ_PIDS[@]} \ + ${ZIO_REQ_NUL_PIDS[@]} \ + ${ZIO_IRQ_NUL_PIDS[@]} \ + ${ZIO_REQ_RD_PID[@]} \ + ${ZIO_IRQ_RD_PIDS[@]} \ + ${ZIO_REQ_WR_PIDS[@]} \ + ${ZIO_IRQ_WR_PIDS[@]} \ + ${ZIO_REQ_FR_PIDS[@]} \ + ${ZIO_IRQ_FR_PIDS[@]} \ + ${ZIO_REQ_CM_PIDS[@]} \ + ${ZIO_IRQ_CM_PIDS[@]} \ + ${ZIO_REQ_CTL_PIDS[@]} \ + ${ZIO_IRQ_CTL_PIDS[@]} \ + ${TXG_QUIESCE_PIDS[@]} \ + ${TXG_SYNC_PIDS[@]} \ + ${TXG_TIMELIMIT_PIDS[@]} \ + ${ARC_RECLAIM_PIDS[@]} \ + ${L2ARC_FEED_PIDS[@]} \ + ${KPIOS_IO_PIDS[@]} ) + + while [ ${RUN_DONE} -eq 0 ]; do + NOW=`date +%s.%N` + LOG_PIDS="${RUN_LOG_DIR}/${RUN_ID}/pids-${NOW}" + LOG_DISK="${RUN_LOG_DIR}/${RUN_ID}/disk-${NOW}" + + for PID in "${ALL_PIDS[@]}"; do + if [ -z ${PID} ]; then + continue; + fi + + if [ -e /proc/${PID}/stat ]; then + cat /proc/${PID}/stat | head -n1 >>${LOG_PIDS} + else + echo "<${PID} exited>" >>${LOG_PIDS} + fi + done + + cat /proc/diskstats >${LOG_DISK} + + NOW2=`date +%s.%N` + DELTA=`echo "${POLL_INTERVAL}-(${NOW2}-${NOW})" | bc` + sleep ${DELTA} + done +} + +aquire_pids +log_pids + +exit 0 diff --git a/scripts/survey.sh b/scripts/survey.sh new file mode 100644 index 0000000000..a75bb2d6f0 --- /dev/null +++ b/scripts/survey.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +prog=survey.sh +. ../.script-config + +LOG=/home/`whoami`/zpios-logs/`uname -r`/zpios-`date +%Y%m%d`/ +mkdir -p ${LOG} + +# Apply all tunings described below to generate some best case +# numbers for what is acheivable with some more elbow grease. +NAME="prefetch+zerocopy+checksum+pending1024+kmem" +echo "----------------------- ${NAME} ------------------------------" +./zpios.sh \ + "" \ + "zfs_prefetch_disable=1 zfs_vdev_max_pending=1024 zio_bulk_flags=0x100" \ + "--zerocopy" \ + ${LOG}/${NAME}/ \ + "${CMDDIR}/zfs/zfs set checksum=off lustre" | \ + tee ${LOG}/${NAME}.txt + +# Baseline number for an out of the box config with no manual tuning. +# Ideally, we will want things to be automatically tuned and for this +# number to approach the tweaked out results above. +NAME="baseline" +echo "----------------------- ${NAME} ------------------------------" +./zpios.sh \ + "" \ + "" \ + "" \ + ${LOG}/${NAME}/ | \ + tee ${LOG}/${NAME}.txt + +# Disable ZFS's prefetching. For some reason still not clear to me +# current prefetching policy is quite bad for a random workload. +# Allow the algorithm to detect a random workload and not do anything +# may be the way to address this issue. +NAME="prefetch" +echo "----------------------- ${NAME} ------------------------------" +./zpios.sh \ + "" \ + "zfs_prefetch_disable=1" \ + "" \ + ${LOG}/${NAME}/ | \ + tee ${LOG}/${NAME}.txt + +# As expected, simulating a zerocopy IO path improves performance +# by freeing up lots of CPU which is wasted move data between buffers. +NAME="zerocopy" +echo "----------------------- ${NAME} ------------------------------" +./zpios.sh \ + "" \ + "" \ + "--zerocopy" \ + ${LOG}/${NAME}/ | \ + tee ${LOG}/${NAME}.txt + +# Disabling checksumming should show some (if small) improvement +# simply due to freeing up a modest amount of CPU. +NAME="checksum" +echo "----------------------- ${NAME} ------------------------------" +./zpios.sh \ + "" \ + "" \ + "" \ + ${LOG}/${NAME}/ \ + "${CMDDIR}/zfs/zfs set checksum=off lustre" | \ + tee ${LOG}/${NAME}.txt + +# Increasing the pending IO depth also seems to improve things likely +# at the expense of latency. This should be exported more because I'm +# seeing a much bigger impact there that I would have expected. There +# may be some low hanging fruit to be found here. +NAME="pending" +echo "----------------------- ${NAME} ------------------------------" +./zpios.sh \ + "" \ + "zfs_vdev_max_pending=1024" \ + "" \ + ${LOG}/${NAME}/ | \ + tee ${LOG}/${NAME}.txt + +# To avoid memory fragmentation issues our slab implementation can be +# based on a virtual address space. Interestingly, we take a pretty +# substantial performance penalty for this somewhere in the low level +# IO drivers. If we back the slab with kmem pages we see far better +# read performance numbers at the cost of memory fragmention and general +# system instability due to large allocations. This may be because of +# an optimization in the low level drivers due to the contigeous kmem +# based memory. This needs to be explained. The good news here is that +# with zerocopy interfaces added at the DMU layer we could gaurentee +# kmem based memory for a pool of pages. +# +# 0x100 = KMC_KMEM - Force kmem_* based slab +# 0x200 = KMC_VMEM - Force vmem_* based slab +NAME="kmem" +echo "----------------------- ${NAME} ------------------------------" +./zpios.sh \ + "" \ + "zio_bulk_flags=0x100" \ + "" \ + ${LOG}/${NAME}/ | \ + tee ${LOG}/${NAME}.txt diff --git a/scripts/zpios.sh b/scripts/zpios.sh new file mode 100644 index 0000000000..1b91c5cc55 --- /dev/null +++ b/scripts/zpios.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +prog=zpios.sh +. ../.script-config + +SPL_OPTIONS="spl_debug_mask=0 spl_debug_subsys=0 ${1}" +ZPOOL_OPTIONS=$2 +ZPIOS_OPTIONS=$3 +PROFILE_ZPIOS_LOGS=$4 +ZPIOS_PRE=$5 +ZPIOS_POST=$6 + +PROFILE_ZPIOS_PRE=/home/behlendo/src/zfs/scripts/profile-zpios-pre.sh +PROFILE_ZPIOS_POST=/home/behlendo/src/zfs/scripts/profile-zpios-post.sh + +DEVICES="/dev/hda" + +echo ------------------------- ZFS TEST LOG --------------------------------- +echo -n "Date = "; date +echo -n "Kernel = "; uname -r +echo ------------------------------------------------------------------------ + +echo +./load-zfs.sh "${SPL_OPTIONS}" "${ZPOOL_OPTIONS}" + +echo ---------------------- SPL Sysctl Tunings ------------------------------ +sysctl -A | grep spl +echo + +echo ------------------- SPL/ZPOOL Module Tunings --------------------------- +if [ -d /sys/module/spl/parameters ]; then + grep [0-9] /sys/module/spl/parameters/* + grep [0-9] /sys/module/zpool/parameters/* +else + grep [0-9] /sys/module/spl/* + grep [0-9] /sys/module/zpool/* +fi +echo + +echo "${CMDDIR}/zpool/zpool create -f lustre ${DEVICES}" +${CMDDIR}/zpool/zpool create -f lustre ${DEVICES} + +echo "${CMDDIR}/zpool/zpool status lustre" +${CMDDIR}/zpool/zpool status lustre + +echo "Waiting for /dev/zpios to come up..." +while [ ! -c /dev/zpios ]; do + sleep 1 +done + +if [ -n "${ZPIOS_PRE}" ]; then + ${ZPIOS_PRE} +fi + +# Usage: zpios +# --chunksize -c =values +# --chunksize_low -a =value +# --chunksize_high -b =value +# --chunksize_incr -g =value +# --offset -o =values +# --offset_low -m =value +# --offset_high -q =value +# --offset_incr -r =value +# --regioncount -n =values +# --regioncount_low -i =value +# --regioncount_high -j =value +# --regioncount_incr -k =value +# --threadcount -t =values +# --threadcount_low -l =value +# --threadcount_high -h =value +# --threadcount_incr -e =value +# --regionsize -s =values +# --regionsize_low -A =value +# --regionsize_high -B =value +# --regionsize_incr -C =value +# --cleanup -x +# --verify -V +# --zerocopy -z +# --threaddelay -T =jiffies +# --regionnoise -I =shift +# --chunknoise -N =bytes +# --prerun -P =pre-command +# --postrun -R =post-command +# --log -G =log directory +# --pool | --path -p =pool name +# --load -L =dmuio +# --help -? =this help +# --verbose -v =increase verbosity + +# --prerun=${PROFILE_ZPIOS_PRE} \ +# --postrun=${PROFILE_ZPIOS_POST} \ + +CMD="${CMDDIR}/zpios/zpios \ + --load=dmuio \ + --path=lustre \ + --chunksize=1M \ + --regionsize=4M \ + --regioncount=64 \ + --threadcount=4 \ + --offset=4M \ + --cleanup \ + --verbose \ + --human-readable \ + ${ZPIOS_OPTIONS} \ + --log=${PROFILE_ZPIOS_LOGS}" +echo +date +echo ${CMD} +$CMD +date + +if [ -n "${ZPIOS_POST}" ]; then + ${ZPIOS_POST} +fi + +${CMDDIR}/zpool/zpool destroy lustre + +echo ---------------------- SPL Sysctl Tunings ------------------------------ +sysctl -A | grep spl +echo + +echo ------------------------ KSTAT Statistics ------------------------------ +echo ARCSTATS +cat /proc/spl/kstat/zfs/arcstats +echo +echo VDEV_CACHE_STATS +cat /proc/spl/kstat/zfs/vdev_cache_stats +echo +echo SLAB +cat /proc/spl/kmem/slab +echo + +./unload-zfs.sh diff --git a/zfs/lib/Makefile.in b/zfs/lib/Makefile.in index 8e328ca2df..575399286d 100644 --- a/zfs/lib/Makefile.in +++ b/zfs/lib/Makefile.in @@ -8,3 +8,5 @@ subdir-m += libzpool # Kernel | User SPA/DMU/ZVOL/ZPL subdir-m += libavl # Kernel &| User space AVL tree support subdir-m += libnvpair # Kernel &| User space name/value support subdir-m += libzcommon # Kernel &| User space common support + +subdir-m += libzpios # Kernel DMU test app diff --git a/zfs/lib/libzpios/Makefile.in b/zfs/lib/libzpios/Makefile.in new file mode 100644 index 0000000000..2aa1482a8d --- /dev/null +++ b/zfs/lib/libzpios/Makefile.in @@ -0,0 +1,17 @@ +DISTFILES = Makefile.in *.c + +MODULE := zpios + +# Compile as kernel module. Needed symlinks created for all +# k* objects created by top level configure script. + +EXTRA_CFLAGS = @KERNELCPPFLAGS@ +EXTRA_CFLAGS += -I@LIBDIR@/libzpios/include +EXTRA_CFLAGS += -I@LIBDIR@/libzcommon/include +EXTRA_CFLAGS += -I@LIBDIR@/libport/include +EXTRA_CFLAGS += -I@LIBDIR@/libavl/include +EXTRA_CFLAGS += -I@LIBDIR@/libnvpair/include + +obj-m := ${MODULE}.o + +${MODULE}-objs += kpios.o # Kernel PIOS test case diff --git a/zfs/lib/libzpios/include/Makefile.in b/zfs/lib/libzpios/include/Makefile.in new file mode 100644 index 0000000000..20e4bd7c9d --- /dev/null +++ b/zfs/lib/libzpios/include/Makefile.in @@ -0,0 +1 @@ +DISTFILES = kpios-ctl.h kpios-internal.h diff --git a/zfs/lib/libzpios/include/kpios-ctl.h b/zfs/lib/libzpios/include/kpios-ctl.h new file mode 100644 index 0000000000..c9c3526ade --- /dev/null +++ b/zfs/lib/libzpios/include/kpios-ctl.h @@ -0,0 +1,120 @@ +/* + * This file is part of the ZFS Linux port. + * + * Copyright (c) 2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory + * Written by: + * Brian Behlendorf , + * Herb Wartens , + * Jim Garlick + * LLNL-CODE-403049 + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _KPIOS_CTL_H +#define _KPIOS_CTL_H + +/* Contains shared definitions which both the userspace + * and kernelspace portions of kpios must agree on. + */ +#ifndef _KERNEL +#include +#endif + +#define KPIOS_MAJOR 232 /* XXX - Arbitrary */ +#define KPIOS_MINORS 1 +#define KPIOS_DEV "/dev/kpios" + +#define DMU_IO 0x01 + +#define DMU_WRITE 0x01 +#define DMU_READ 0x02 +#define DMU_VERIFY 0x04 +#define DMU_REMOVE 0x08 +#define DMU_FPP 0x10 +#define DMU_WRITE_ZC 0x20 /* Incompatible with DMU_VERIFY */ +#define DMU_READ_ZC 0x40 /* Incompatible with DMU_VERIFY */ + +#define KPIOS_NAME_SIZE 16 +#define KPIOS_PATH_SIZE 128 + +#define PHASE_PRE "pre" +#define PHASE_POST "post" +#define PHASE_WRITE "write" +#define PHASE_READ "read" + +#define KPIOS_CFG_MAGIC 0x87237190U +typedef struct kpios_cfg { + uint32_t cfg_magic; /* Unique magic */ + int32_t cfg_cmd; /* Config command */ + int32_t cfg_arg1; /* Config command arg 1 */ + int32_t cfg_rc1; /* Config response 1 */ +} kpios_cfg_t; + +typedef struct kpios_time { + struct timespec start; + struct timespec stop; + struct timespec delta; +} kpios_time_t; + +typedef struct kpios_stats { + kpios_time_t total_time; + kpios_time_t cr_time; + kpios_time_t rm_time; + kpios_time_t wr_time; + kpios_time_t rd_time; + uint64_t wr_data; + uint64_t wr_chunks; + uint64_t rd_data; + uint64_t rd_chunks; +} kpios_stats_t; + +#define KPIOS_CMD_MAGIC 0x49715385U +typedef struct kpios_cmd { + uint32_t cmd_magic; /* Unique magic */ + uint32_t cmd_id; /* Run ID */ + char cmd_pool[KPIOS_NAME_SIZE]; /* Pool name */ + uint64_t cmd_chunk_size; /* Chunk size */ + uint32_t cmd_thread_count; /* Thread count */ + uint32_t cmd_region_count; /* Region count */ + uint64_t cmd_region_size; /* Region size */ + uint64_t cmd_offset; /* Region offset */ + uint32_t cmd_region_noise; /* Region noise */ + uint32_t cmd_chunk_noise; /* Chunk noise */ + uint32_t cmd_thread_delay; /* Thread delay */ + uint32_t cmd_flags; /* Test flags */ + char cmd_pre[KPIOS_PATH_SIZE]; /* Pre-exec hook */ + char cmd_post[KPIOS_PATH_SIZE]; /* Post-exec hook */ + char cmd_log[KPIOS_PATH_SIZE]; /* Requested log dir */ + uint64_t cmd_data_size; /* Opaque data size */ + char cmd_data_str[0]; /* Opaque data region */ +} kpios_cmd_t; + +/* Valid ioctls */ +#define KPIOS_CFG _IOWR('f', 101, long) +#define KPIOS_CMD _IOWR('f', 102, long) + +/* Valid configuration commands */ +#define KPIOS_CFG_BUFFER_CLEAR 0x001 /* Clear text buffer */ +#define KPIOS_CFG_BUFFER_SIZE 0x002 /* Resize text buffer */ + +#endif /* _KPIOS_CTL_H */ diff --git a/zfs/lib/libzpios/include/kpios-internal.h b/zfs/lib/libzpios/include/kpios-internal.h new file mode 100644 index 0000000000..3da1fe3823 --- /dev/null +++ b/zfs/lib/libzpios/include/kpios-internal.h @@ -0,0 +1,137 @@ +/* + * This file is part of the ZFS Linux port. + * + * Copyright (c) 2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory + * Written by: + * Brian Behlendorf , + * Herb Wartens , + * Jim Garlick + * LLNL-CODE-403049 + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _KPIOS_INTERNAL_H +#define _KPIOS_INTERNAL_H + +#include "kpios-ctl.h" + +#define OBJ_SIZE 64 + +struct run_args; + +typedef struct dmu_obj { + objset_t *os; + uint64_t obj; +} dmu_obj_t; + +/* thread doing the IO data */ +typedef struct thread_data { + struct run_args *run_args; + int thread_no; + int rc; + kpios_stats_t stats; + kmutex_t lock; +} thread_data_t; + +/* region for IO data */ +typedef struct kpios_region { + __u64 wr_offset; + __u64 rd_offset; + __u64 init_offset; + __u64 max_offset; + dmu_obj_t obj; + kpios_stats_t stats; + kmutex_t lock; +} kpios_region_t; + +/* arguments for one run */ +typedef struct run_args { + /* Config args */ + int id; + char pool[KPIOS_NAME_SIZE]; + __u64 chunk_size; + __u32 thread_count; + __u32 region_count; + __u64 region_size; + __u64 offset; + __u32 region_noise; + __u32 chunk_noise; + __u32 thread_delay; + __u32 flags; + char pre[KPIOS_PATH_SIZE]; + char post[KPIOS_PATH_SIZE]; + char log[KPIOS_PATH_SIZE]; + + /* Control data */ + objset_t *os; + wait_queue_head_t waitq; + volatile uint64_t threads_done; + kmutex_t lock_work; + kmutex_t lock_ctl; + __u32 region_next; + + /* Results data */ + struct file *file; + kpios_stats_t stats; + + thread_data_t **threads; + kpios_region_t regions[0]; /* Must be last element */ +} run_args_t; + +#define KPIOS_INFO_BUFFER_SIZE 65536 +#define KPIOS_INFO_BUFFER_REDZONE 1024 + +typedef struct kpios_info { + spinlock_t info_lock; + int info_size; + char *info_buffer; + char *info_head; /* Internal kernel use only */ +} kpios_info_t; + +#define kpios_print(file, format, args...) \ +({ kpios_info_t *_info_ = (kpios_info_t *)file->private_data; \ + int _rc_; \ + \ + ASSERT(_info_); \ + ASSERT(_info_->info_buffer); \ + \ + spin_lock(&_info_->info_lock); \ + \ + /* Don't allow the kernel to start a write in the red zone */ \ + if ((int)(_info_->info_head - _info_->info_buffer) > \ + (_info_->info_size - KPIOS_INFO_BUFFER_REDZONE)) { \ + _rc_ = -EOVERFLOW; \ + } else { \ + _rc_ = sprintf(_info_->info_head, format, args); \ + if (_rc_ >= 0) \ + _info_->info_head += _rc_; \ + } \ + \ + spin_unlock(&_info_->info_lock); \ + _rc_; \ +}) + +#define kpios_vprint(file, test, format, args...) \ + kpios_print(file, "%*s: " format, KPIOS_NAME_SIZE, test, args) + +#endif /* _KPIOS_INTERNAL_H */ diff --git a/zfs/lib/libzpios/kpios.c b/zfs/lib/libzpios/kpios.c new file mode 100644 index 0000000000..a6493d8d50 --- /dev/null +++ b/zfs/lib/libzpios/kpios.c @@ -0,0 +1,1295 @@ +/* + * This file is part of the ZFS Linux port. + * + * Copyright (c) 2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory + * Written by: + * Brian Behlendorf , + * Herb Wartens , + * Jim Garlick + * LLNL-CODE-403049 + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Kernel PIOS DMU implemenation originally derived from PIOS test code. + * Character control interface derived from SPL code. + */ + +#include +#include +#include +#include +#include "kpios-internal.h" + + +static struct class *kpios_class; + + +static inline +struct timespec timespec_add(struct timespec lhs, struct timespec rhs) +{ + struct timespec ts_delta; + set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + return ts_delta; +} + +static +int kpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) +{ + /* This is stack heavy but it should be OK since we are only + * making the upcall between tests when the stack is shallow. + */ + char id[16], chunk_size[16], region_size[16], thread_count[16]; + char region_count[16], offset[16], region_noise[16], chunk_noise[16]; + char thread_delay[16], flags[16], result[8]; + char *argv[16], *envp[4]; + + if (strlen(path) == 0) + return -ENOENT; + + snprintf(id, 15, "%d", run_args->id); + snprintf(chunk_size, 15, "%lu", (long unsigned)run_args->chunk_size); + snprintf(region_size, 15, "%lu",(long unsigned) run_args->region_size); + snprintf(thread_count, 15, "%u", run_args->thread_count); + snprintf(region_count, 15, "%u", run_args->region_count); + snprintf(offset, 15, "%lu", (long unsigned)run_args->offset); + snprintf(region_noise, 15, "%u", run_args->region_noise); + snprintf(chunk_noise, 15, "%u", run_args->chunk_noise); + snprintf(thread_delay, 15, "%u", run_args->thread_delay); + snprintf(flags, 15, "0x%x", run_args->flags); + snprintf(result, 7, "%d", rc); + + /* Passing 15 args to registered pre/post upcall */ + argv[0] = path; + argv[1] = phase; + argv[2] = strlen(run_args->log) ? run_args->log : ""; + argv[3] = id; + argv[4] = run_args->pool; + argv[5] = chunk_size; + argv[6] = region_size; + argv[7] = thread_count; + argv[8] = region_count; + argv[9] = offset; + argv[10] = region_noise; + argv[11] = chunk_noise; + argv[12] = thread_delay; + argv[13] = flags; + argv[14] = result; + argv[15] = NULL; + + /* Passing environment for userspace upcall */ + envp[0] = "HOME=/"; + envp[1] = "TERM=linux"; + envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; + envp[3] = NULL; + + return call_usermodehelper(path, argv, envp, 1); +} + +static uint64_t +kpios_dmu_object_create(run_args_t *run_args, objset_t *os) +{ + struct dmu_tx *tx; + uint64_t obj = 0ULL; + int rc; + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, OBJ_SIZE); + rc = dmu_tx_assign(tx, TXG_WAIT); + if (rc) { + kpios_print(run_args->file, + "dmu_tx_assign() failed: %d\n", rc); + dmu_tx_abort(tx); + return obj; + } + + obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, + DMU_OT_NONE, 0, tx); + rc = dmu_object_set_blocksize(os, obj, 128ULL << 10, 0, tx); + if (rc) { + kpios_print(run_args->file, + "dmu_object_set_blocksize() failed: %d\n", rc); + dmu_tx_abort(tx); + return obj; + } + + dmu_tx_commit(tx); + + return obj; +} + +static int +kpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) +{ + struct dmu_tx *tx; + int rc; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); + rc = dmu_tx_assign(tx, TXG_WAIT); + if (rc) { + kpios_print(run_args->file, + "dmu_tx_assign() failed: %d\n", rc); + dmu_tx_abort(tx); + return rc; + } + + rc = dmu_object_free(os, obj, tx); + if (rc) { + kpios_print(run_args->file, + "dmu_object_free() failed: %d\n", rc); + dmu_tx_abort(tx); + return rc; + } + + dmu_tx_commit(tx); + + return 0; +} + +static int +kpios_dmu_setup(run_args_t *run_args) +{ + kpios_time_t *t = &(run_args->stats.cr_time); + objset_t *os; + uint64_t obj = 0ULL; + int i, rc = 0; + + t->start = current_kernel_time(); + + rc = dmu_objset_open(run_args->pool, DMU_OST_ZFS, DS_MODE_STANDARD, &os); + if (rc) { + kpios_print(run_args->file, "Error dmu_objset_open() " + "failed: %d\n", rc); + goto out; + } + + if (!(run_args->flags & DMU_FPP)) { + obj = kpios_dmu_object_create(run_args, os); + if (obj == 0) { + rc = -EBADF; + kpios_print(run_args->file, "Error kpios_dmu_" + "object_create() failed, %d\n", rc); + goto out; + } + } + + for (i = 0; i < run_args->region_count; i++) { + kpios_region_t *region; + + region = &run_args->regions[i]; + mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); + + if (run_args->flags & DMU_FPP) { + /* File per process */ + region->obj.os = os; + region->obj.obj = kpios_dmu_object_create(run_args, os); + ASSERT(region->obj.obj > 0); /* XXX - Handle this */ + region->wr_offset = run_args->offset; + region->rd_offset = run_args->offset; + region->init_offset = run_args->offset; + region->max_offset = run_args->offset + + run_args->region_size; + } else { + /* Single shared file */ + region->obj.os = os; + region->obj.obj = obj; + region->wr_offset = run_args->offset * i; + region->rd_offset = run_args->offset * i; + region->init_offset = run_args->offset * i; + region->max_offset = run_args->offset * + i + run_args->region_size; + } + } + + run_args->os = os; +out: + t->stop = current_kernel_time(); + t->delta = timespec_sub(t->stop, t->start); + + return rc; +} + +static int +kpios_setup_run(run_args_t **run_args, kpios_cmd_t *kcmd, struct file *file) +{ + run_args_t *ra; + int rc, size; + + size = sizeof(*ra) + kcmd->cmd_region_count * sizeof(kpios_region_t); + + ra = vmem_zalloc(size, KM_SLEEP); + if (ra == NULL) { + kpios_print(file, "Unable to vmem_zalloc() %d bytes " + "for regions\n", size); + return -ENOMEM; + } + + *run_args = ra; + strncpy(ra->pool, kcmd->cmd_pool, KPIOS_NAME_SIZE - 1); + strncpy(ra->pre, kcmd->cmd_pre, KPIOS_PATH_SIZE - 1); + strncpy(ra->post, kcmd->cmd_post, KPIOS_PATH_SIZE - 1); + strncpy(ra->log, kcmd->cmd_log, KPIOS_PATH_SIZE - 1); + ra->id = kcmd->cmd_id; + ra->chunk_size = kcmd->cmd_chunk_size; + ra->thread_count = kcmd->cmd_thread_count; + ra->region_count = kcmd->cmd_region_count; + ra->region_size = kcmd->cmd_region_size; + ra->offset = kcmd->cmd_offset; + ra->region_noise = kcmd->cmd_region_noise; + ra->chunk_noise = kcmd->cmd_chunk_noise; + ra->thread_delay = kcmd->cmd_thread_delay; + ra->flags = kcmd->cmd_flags; + ra->stats.wr_data = 0; + ra->stats.wr_chunks = 0; + ra->stats.rd_data = 0; + ra->stats.rd_chunks = 0; + ra->region_next = 0; + ra->file = file; + mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); + + rc = kpios_dmu_setup(ra); + if (rc) { + mutex_destroy(&ra->lock_ctl); + mutex_destroy(&ra->lock_work); + vmem_free(ra, size); + *run_args = NULL; + } + + return rc; +} + +static int +kpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, + __u32 *chunk_size, kpios_region_t **region, __u32 flags) +{ + int i, j, count = 0; + unsigned int random_int; + + get_random_bytes(&random_int, sizeof(unsigned int)); + + mutex_enter(&run_args->lock_work); + i = run_args->region_next; + + /* XXX: I don't much care for this chunk selection mechansim + * there's the potential to burn a lot of time here doing nothing + * useful while holding the global lock. This could give some + * misleading performance results. I'll fix it latter. + */ + while (count < run_args->region_count) { + __u64 *rw_offset; + kpios_time_t *rw_time; + + j = i % run_args->region_count; + *region = &(run_args->regions[j]); + + if (flags & DMU_WRITE) { + rw_offset = &((*region)->wr_offset); + rw_time = &((*region)->stats.wr_time); + } else { + rw_offset = &((*region)->rd_offset); + rw_time = &((*region)->stats.rd_time); + } + + /* test if region is fully written */ + if (*rw_offset + *chunk_size > (*region)->max_offset) { + i++; + count++; + + if (unlikely(rw_time->stop.tv_sec == 0) && + unlikely(rw_time->stop.tv_nsec == 0)) + rw_time->stop = current_kernel_time(); + + continue; + } + + *offset = *rw_offset; + *obj = (*region)->obj; + *rw_offset += *chunk_size; + + /* update ctl structure */ + if (run_args->region_noise) { + get_random_bytes(&random_int, sizeof(unsigned int)); + run_args->region_next += random_int % run_args->region_noise; + } else { + run_args->region_next++; + } + + mutex_exit(&run_args->lock_work); + return 1; + } + + /* nothing left to do */ + mutex_exit(&run_args->lock_work); + + return 0; +} + +static void +kpios_remove_objects(run_args_t *run_args) +{ + kpios_time_t *t = &(run_args->stats.rm_time); + kpios_region_t *region; + int rc = 0, i; + + t->start = current_kernel_time(); + + if (run_args->flags & DMU_REMOVE) { + if (run_args->flags & DMU_FPP) { + for (i = 0; i < run_args->region_count; i++) { + region = &run_args->regions[i]; + rc = kpios_dmu_object_free(run_args, + region->obj.os, + region->obj.obj); + if (rc) + kpios_print(run_args->file, "Error " + "removing object %d, %d\n", + (int)region->obj.obj, rc); + } + } else { + region = &run_args->regions[0]; + rc = kpios_dmu_object_free(run_args, + region->obj.os, + region->obj.obj); + if (rc) + kpios_print(run_args->file, "Error " + "removing object %d, %d\n", + (int)region->obj.obj, rc); + } + } + + dmu_objset_close(run_args->os); + + t->stop = current_kernel_time(); + t->delta = timespec_sub(t->stop, t->start); +} + +static void +kpios_cleanup_run(run_args_t *run_args) +{ + int i, size = 0; + + if (run_args == NULL) + return; + + if (run_args->threads != NULL) { + for (i = 0; i < run_args->thread_count; i++) { + if (run_args->threads[i]) { + mutex_destroy(&run_args->threads[i]->lock); + kmem_free(run_args->threads[i], + sizeof(thread_data_t)); + } + } + + kmem_free(run_args->threads, + sizeof(thread_data_t *) * run_args->thread_count); + } + + if (run_args->regions != NULL) + for (i = 0; i < run_args->region_count; i++) + mutex_destroy(&run_args->regions[i].lock); + + mutex_destroy(&run_args->lock_work); + mutex_destroy(&run_args->lock_ctl); + + if (run_args->regions != NULL) + size = run_args->region_count * sizeof(kpios_region_t); + + vmem_free(run_args, sizeof(*run_args) + size); +} + +static int +kpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object, + uint64_t offset, uint64_t size, const void *buf) +{ + struct dmu_tx *tx; + int rc, how = TXG_WAIT; + int flags = 0; + + while (1) { + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, object, offset, size); + rc = dmu_tx_assign(tx, how); + + if (rc) { + if (rc == ERESTART && how == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + continue; + } + kpios_print(run_args->file, + "Error in dmu_tx_assign(), %d", rc); + dmu_tx_abort(tx); + return rc; + } + break; + } + + if (run_args->flags & DMU_WRITE_ZC) + flags |= DMU_WRITE_ZEROCOPY; + + dmu_write_impl(os, object, offset, size, buf, tx, flags); + dmu_tx_commit(tx); + + return 0; +} + +static int +kpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object, + uint64_t offset, uint64_t size, void *buf) +{ + int flags = 0; + + if (run_args->flags & DMU_READ_ZC) + flags |= DMU_READ_ZEROCOPY; + + return dmu_read_impl(os, object, offset, size, buf, flags); +} + +static int +kpios_thread_main(void *data) +{ + thread_data_t *thr = (thread_data_t *)data; + run_args_t *run_args = thr->run_args; + kpios_time_t t; + + dmu_obj_t obj; + __u64 offset; + __u32 chunk_size; + kpios_region_t *region; + char *buf; + + unsigned int random_int; + int chunk_noise = run_args->chunk_noise; + int chunk_noise_tmp = 0; + int thread_delay = run_args->thread_delay; + int thread_delay_tmp = 0; + + int i, rc = 0; + + if (chunk_noise) { + get_random_bytes(&random_int, sizeof(unsigned int)); + chunk_noise_tmp = (random_int % (chunk_noise * 2)) - chunk_noise; + } + + /* It's OK to vmem_alloc() this memory because it will be copied + * in to the slab and pointers to the slab copy will be setup in + * the bio when the IO is submitted. This of course is not ideal + * since we want a zero-copy IO path if possible. It would be nice + * to have direct access to those slab entries. + */ + chunk_size = run_args->chunk_size + chunk_noise_tmp; + buf = (char *)vmem_alloc(chunk_size, KM_SLEEP); + ASSERT(buf); + + /* Trivial data verification pattern for now. */ + if (run_args->flags & DMU_VERIFY) + memset(buf, 'z', chunk_size); + + /* Write phase */ + mutex_enter(&thr->lock); + thr->stats.wr_time.start = current_kernel_time(); + mutex_exit(&thr->lock); + + while (kpios_get_work_item(run_args, &obj, &offset, + &chunk_size, ®ion, DMU_WRITE)) { + if (thread_delay) { + get_random_bytes(&random_int, sizeof(unsigned int)); + thread_delay_tmp = random_int % thread_delay; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(thread_delay_tmp); /* In jiffies */ + } + + t.start = current_kernel_time(); + rc = kpios_dmu_write(run_args, obj.os, obj.obj, + offset, chunk_size, buf); + t.stop = current_kernel_time(); + t.delta = timespec_sub(t.stop, t.start); + + if (rc) { + kpios_print(run_args->file, "IO error while doing " + "dmu_write(): %d\n", rc); + break; + } + + mutex_enter(&thr->lock); + thr->stats.wr_data += chunk_size; + thr->stats.wr_chunks++; + thr->stats.wr_time.delta = timespec_add( + thr->stats.wr_time.delta, t.delta); + mutex_exit(&thr->lock); + + mutex_enter(®ion->lock); + region->stats.wr_data += chunk_size; + region->stats.wr_chunks++; + region->stats.wr_time.delta = timespec_add( + region->stats.wr_time.delta, t.delta); + + /* First time region was accessed */ + if (region->init_offset == offset) + region->stats.wr_time.start = t.start; + + mutex_exit(®ion->lock); + } + + mutex_enter(&run_args->lock_ctl); + run_args->threads_done++; + mutex_exit(&run_args->lock_ctl); + + mutex_enter(&thr->lock); + thr->rc = rc; + thr->stats.wr_time.stop = current_kernel_time(); + mutex_exit(&thr->lock); + wake_up(&run_args->waitq); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + + /* Check if we should exit */ + mutex_enter(&thr->lock); + rc = thr->rc; + mutex_exit(&thr->lock); + if (rc) + goto out; + + /* Read phase */ + mutex_enter(&thr->lock); + thr->stats.rd_time.start = current_kernel_time(); + mutex_exit(&thr->lock); + + while (kpios_get_work_item(run_args, &obj, &offset, + &chunk_size, ®ion, DMU_READ)) { + if (thread_delay) { + get_random_bytes(&random_int, sizeof(unsigned int)); + thread_delay_tmp = random_int % thread_delay; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(thread_delay_tmp); /* In jiffies */ + } + + if (run_args->flags & DMU_VERIFY) + memset(buf, 0, chunk_size); + + t.start = current_kernel_time(); + rc = kpios_dmu_read(run_args, obj.os, obj.obj, + offset, chunk_size, buf); + t.stop = current_kernel_time(); + t.delta = timespec_sub(t.stop, t.start); + + if (rc) { + kpios_print(run_args->file, "IO error while doing " + "dmu_read(): %d\n", rc); + break; + } + + /* Trivial data verification, expensive! */ + if (run_args->flags & DMU_VERIFY) { + for (i = 0; i < chunk_size; i++) { + if (buf[i] != 'z') { + kpios_print(run_args->file, + "IO verify error: %d/%d/%d\n", + (int)obj.obj, (int)offset, + (int)chunk_size); + break; + } + } + } + + mutex_enter(&thr->lock); + thr->stats.rd_data += chunk_size; + thr->stats.rd_chunks++; + thr->stats.rd_time.delta = timespec_add( + thr->stats.rd_time.delta, t.delta); + mutex_exit(&thr->lock); + + mutex_enter(®ion->lock); + region->stats.rd_data += chunk_size; + region->stats.rd_chunks++; + region->stats.rd_time.delta = timespec_add( + region->stats.rd_time.delta, t.delta); + + /* First time region was accessed */ + if (region->init_offset == offset) + region->stats.rd_time.start = t.start; + + mutex_exit(®ion->lock); + } + + mutex_enter(&run_args->lock_ctl); + run_args->threads_done++; + mutex_exit(&run_args->lock_ctl); + + mutex_enter(&thr->lock); + thr->rc = rc; + thr->stats.rd_time.stop = current_kernel_time(); + mutex_exit(&thr->lock); + wake_up(&run_args->waitq); + +out: + vmem_free(buf, chunk_size); + do_exit(0); + + return rc; /* Unreachable, due to do_exit() */ +} + +static int +kpios_thread_done(run_args_t *run_args) +{ + ASSERT(run_args->threads_done <= run_args->thread_count); + return (run_args->threads_done == run_args->thread_count); +} + +static int +kpios_threads_run(run_args_t *run_args) +{ + struct task_struct *tsk, **tsks; + thread_data_t *thr = NULL; + kpios_time_t *tt = &(run_args->stats.total_time); + kpios_time_t *tw = &(run_args->stats.wr_time); + kpios_time_t *tr = &(run_args->stats.rd_time); + int i, rc = 0, tc = run_args->thread_count; + DEFINE_WAIT(wait); + + kpios_upcall(run_args->pre, PHASE_PRE, run_args, 0); + + tsks = kmem_zalloc(sizeof(struct task_struct *) * tc, KM_SLEEP); + if (tsks == NULL) { + rc = -ENOMEM; + goto cleanup2; + } + + run_args->threads = kmem_zalloc(sizeof(thread_data_t *) * tc, KM_SLEEP); + if (run_args->threads == NULL) { + rc = -ENOMEM; + goto cleanup; + } + + init_waitqueue_head(&run_args->waitq); + run_args->threads_done = 0; + + /* Create all the needed threads which will sleep until awoken */ + for (i = 0; i < tc; i++) { + thr = kmem_zalloc(sizeof(thread_data_t), KM_SLEEP); + if (thr == NULL) { + rc = -ENOMEM; + goto taskerr; + } + + thr->thread_no = i; + thr->run_args = run_args; + thr->rc = 0; + mutex_init(&thr->lock, NULL, MUTEX_DEFAULT, NULL); + run_args->threads[i] = thr; + + tsk = kthread_create(kpios_thread_main, (void *)thr, + "%s/%d", "kpios_io", i); + if (IS_ERR(tsk)) { + rc = -EINVAL; + goto taskerr; + } + + tsks[i] = tsk; + } + + tt->start = current_kernel_time(); + + /* Wake up all threads for write phase */ + kpios_upcall(run_args->pre, PHASE_WRITE, run_args, 0); + for (i = 0; i < tc; i++) + wake_up_process(tsks[i]); + + /* Wait for write phase to complete */ + tw->start = current_kernel_time(); + wait_event(run_args->waitq, kpios_thread_done(run_args)); + tw->stop = current_kernel_time(); + + for (i = 0; i < tc; i++) { + thr = run_args->threads[i]; + + mutex_enter(&thr->lock); + + if (!rc && thr->rc) + rc = thr->rc; + + run_args->stats.wr_data += thr->stats.wr_data; + run_args->stats.wr_chunks += thr->stats.wr_chunks; + mutex_exit(&thr->lock); + } + + kpios_upcall(run_args->post, PHASE_WRITE, run_args, rc); + if (rc) { + /* Wake up all threads and tell them to exit */ + for (i = 0; i < tc; i++) { + mutex_enter(&thr->lock); + thr->rc = rc; + mutex_exit(&thr->lock); + + wake_up_process(tsks[i]); + } + goto out; + } + + mutex_enter(&run_args->lock_ctl); + ASSERT(run_args->threads_done == run_args->thread_count); + run_args->threads_done = 0; + mutex_exit(&run_args->lock_ctl); + + /* Wake up all threads for read phase */ + kpios_upcall(run_args->pre, PHASE_READ, run_args, 0); + for (i = 0; i < tc; i++) + wake_up_process(tsks[i]); + + /* Wait for read phase to complete */ + tr->start = current_kernel_time(); + wait_event(run_args->waitq, kpios_thread_done(run_args)); + tr->stop = current_kernel_time(); + + for (i = 0; i < tc; i++) { + thr = run_args->threads[i]; + + mutex_enter(&thr->lock); + + if (!rc && thr->rc) + rc = thr->rc; + + run_args->stats.rd_data += thr->stats.rd_data; + run_args->stats.rd_chunks += thr->stats.rd_chunks; + mutex_exit(&thr->lock); + } + + kpios_upcall(run_args->post, PHASE_READ, run_args, rc); +out: + tt->stop = current_kernel_time(); + tt->delta = timespec_sub(tt->stop, tt->start); + tw->delta = timespec_sub(tw->stop, tw->start); + tr->delta = timespec_sub(tr->stop, tr->start); + +cleanup: + kmem_free(tsks, sizeof(struct task_struct *) * tc); +cleanup2: + kpios_upcall(run_args->post, PHASE_POST, run_args, rc); + + /* Returns first encountered thread error (if any) */ + return rc; + +taskerr: + /* Destroy all threads that were created successfully */ + for (i = 0; i < tc; i++) + if (tsks[i] != NULL) + (void) kthread_stop(tsks[i]); + + goto cleanup; +} + +static int +kpios_do_one_run(struct file *file, kpios_cmd_t *kcmd, + int data_size, void *data) +{ + run_args_t *run_args; + kpios_stats_t *stats = (kpios_stats_t *)data; + int i, n, m, size, rc; + + if ((!kcmd->cmd_chunk_size) || (!kcmd->cmd_region_size) || + (!kcmd->cmd_thread_count) || (!kcmd->cmd_region_count)) { + kpios_print(file, "Invalid chunk_size, region_size, " + "thread_count, or region_count, %d\n", -EINVAL); + return -EINVAL; + } + + if (!(kcmd->cmd_flags & DMU_WRITE) || + !(kcmd->cmd_flags & DMU_READ)) { + kpios_print(file, "Invalid flags, minimally DMU_WRITE " + "and DMU_READ must be set, %d\n", -EINVAL); + return -EINVAL; + } + + if ((kcmd->cmd_flags & (DMU_WRITE_ZC | DMU_READ_ZC)) && + (kcmd->cmd_flags & DMU_VERIFY)) { + kpios_print(file, "Invalid flags, DMU_*_ZC incompatible " + "with DMU_VERIFY, used for performance analysis " + "only, %d\n", -EINVAL); + return -EINVAL; + } + + /* Opaque data on return contains structs of the following form: + * + * kpios_stat_t stats[]; + * stats[0] = run_args->stats; + * stats[1-N] = threads[N]->stats; + * stats[N+1-M] = regions[M]->stats; + * + * Where N is the number of threads, and M is the number of regions. + */ + size = (sizeof(kpios_stats_t) + + (kcmd->cmd_thread_count * sizeof(kpios_stats_t)) + + (kcmd->cmd_region_count * sizeof(kpios_stats_t))); + if (data_size < size) { + kpios_print(file, "Invalid size, command data buffer " + "size too small, (%d < %d)\n", data_size, size); + return -ENOSPC; + } + + rc = kpios_setup_run(&run_args, kcmd, file); + if (rc) + return rc; + + rc = kpios_threads_run(run_args); + kpios_remove_objects(run_args); + if (rc) + goto cleanup; + + if (stats) { + n = 1; + m = 1 + kcmd->cmd_thread_count; + stats[0] = run_args->stats; + + for (i = 0; i < kcmd->cmd_thread_count; i++) + stats[n+i] = run_args->threads[i]->stats; + + for (i = 0; i < kcmd->cmd_region_count; i++) + stats[m+i] = run_args->regions[i].stats; + } + +cleanup: + kpios_cleanup_run(run_args); + return rc; +} + +static int +kpios_open(struct inode *inode, struct file *file) +{ + unsigned int minor = iminor(inode); + kpios_info_t *info; + + if (minor >= KPIOS_MINORS) + return -ENXIO; + + info = (kpios_info_t *)kmem_alloc(sizeof(*info), KM_SLEEP); + if (info == NULL) + return -ENOMEM; + + spin_lock_init(&info->info_lock); + info->info_size = KPIOS_INFO_BUFFER_SIZE; + info->info_buffer = (char *)vmem_alloc(KPIOS_INFO_BUFFER_SIZE,KM_SLEEP); + if (info->info_buffer == NULL) { + kmem_free(info, sizeof(*info)); + return -ENOMEM; + } + + info->info_head = info->info_buffer; + file->private_data = (void *)info; + + return 0; +} + +static int +kpios_release(struct inode *inode, struct file *file) +{ + unsigned int minor = iminor(inode); + kpios_info_t *info = (kpios_info_t *)file->private_data; + + if (minor >= KPIOS_MINORS) + return -ENXIO; + + ASSERT(info); + ASSERT(info->info_buffer); + + vmem_free(info->info_buffer, KPIOS_INFO_BUFFER_SIZE); + kmem_free(info, sizeof(*info)); + + return 0; +} + +static int +kpios_buffer_clear(struct file *file, kpios_cfg_t *kcfg, unsigned long arg) +{ + kpios_info_t *info = (kpios_info_t *)file->private_data; + + ASSERT(info); + ASSERT(info->info_buffer); + + spin_lock(&info->info_lock); + memset(info->info_buffer, 0, info->info_size); + info->info_head = info->info_buffer; + spin_unlock(&info->info_lock); + + return 0; +} + +static int +kpios_buffer_size(struct file *file, kpios_cfg_t *kcfg, unsigned long arg) +{ + kpios_info_t *info = (kpios_info_t *)file->private_data; + char *buf; + int min, size, rc = 0; + + ASSERT(info); + ASSERT(info->info_buffer); + + spin_lock(&info->info_lock); + if (kcfg->cfg_arg1 > 0) { + + size = kcfg->cfg_arg1; + buf = (char *)vmem_alloc(size, KM_SLEEP); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + /* Zero fill and truncate contents when coping buffer */ + min = ((size < info->info_size) ? size : info->info_size); + memset(buf, 0, size); + memcpy(buf, info->info_buffer, min); + vmem_free(info->info_buffer, info->info_size); + info->info_size = size; + info->info_buffer = buf; + info->info_head = info->info_buffer; + } + + kcfg->cfg_rc1 = info->info_size; + + if (copy_to_user((struct kpios_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) + rc = -EFAULT; +out: + spin_unlock(&info->info_lock); + + return rc; +} + +static int +kpios_ioctl_cfg(struct file *file, unsigned long arg) +{ + kpios_cfg_t kcfg; + int rc = 0; + + if (copy_from_user(&kcfg, (kpios_cfg_t *)arg, sizeof(kcfg))) + return -EFAULT; + + if (kcfg.cfg_magic != KPIOS_CFG_MAGIC) { + kpios_print(file, "Bad config magic 0x%x != 0x%x\n", + kcfg.cfg_magic, KPIOS_CFG_MAGIC); + return -EINVAL; + } + + switch (kcfg.cfg_cmd) { + case KPIOS_CFG_BUFFER_CLEAR: + /* cfg_arg1 - Unused + * cfg_rc1 - Unused + */ + rc = kpios_buffer_clear(file, &kcfg, arg); + break; + case KPIOS_CFG_BUFFER_SIZE: + /* cfg_arg1 - 0 - query size; >0 resize + * cfg_rc1 - Set to current buffer size + */ + rc = kpios_buffer_size(file, &kcfg, arg); + break; + default: + kpios_print(file, "Bad config command %d\n", + kcfg.cfg_cmd); + rc = -EINVAL; + break; + } + + return rc; +} + +static int +kpios_ioctl_cmd(struct file *file, unsigned long arg) +{ + kpios_cmd_t kcmd; + int rc = -EINVAL; + void *data = NULL; + + rc = copy_from_user(&kcmd, (kpios_cfg_t *)arg, sizeof(kcmd)); + if (rc) { + kpios_print(file, "Unable to copy command structure " + "from user to kernel memory, %d\n", rc); + return -EFAULT; + } + + if (kcmd.cmd_magic != KPIOS_CMD_MAGIC) { + kpios_print(file, "Bad command magic 0x%x != 0x%x\n", + kcmd.cmd_magic, KPIOS_CFG_MAGIC); + return -EINVAL; + } + + /* Allocate memory for any opaque data the caller needed to pass on */ + if (kcmd.cmd_data_size > 0) { + data = (void *)vmem_alloc(kcmd.cmd_data_size, KM_SLEEP); + if (data == NULL) { + kpios_print(file, "Unable to vmem_alloc() %ld " + "bytes for data buffer\n", + (long)kcmd.cmd_data_size); + return -ENOMEM; + } + + rc = copy_from_user(data, (void *)(arg + offsetof(kpios_cmd_t, + cmd_data_str)), kcmd.cmd_data_size); + if (rc) { + kpios_print(file, "Unable to copy data buffer " + "from user to kernel memory, %d\n", rc); + vmem_free(data, kcmd.cmd_data_size); + return -EFAULT; + } + } + + rc = kpios_do_one_run(file, &kcmd, kcmd.cmd_data_size, data); + + if (data != NULL) { + /* If the test failed do not print out the stats */ + if (rc) + goto cleanup; + + rc = copy_to_user((void *)(arg + offsetof(kpios_cmd_t, + cmd_data_str)), data, kcmd.cmd_data_size); + if (rc) { + kpios_print(file, "Unable to copy data buffer " + "from kernel to user memory, %d\n", rc); + rc = -EFAULT; + } + +cleanup: + vmem_free(data, kcmd.cmd_data_size); + } + + return rc; +} + +static int +kpios_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor = iminor(file->f_dentry->d_inode); + int rc = 0; + + /* Ignore tty ioctls */ + if ((cmd & 0xffffff00) == ((int)'T') << 8) + return -ENOTTY; + + if (minor >= KPIOS_MINORS) + return -ENXIO; + + switch (cmd) { + case KPIOS_CFG: + rc = kpios_ioctl_cfg(file, arg); + break; + case KPIOS_CMD: + rc = kpios_ioctl_cmd(file, arg); + break; + default: + kpios_print(file, "Bad ioctl command %d\n", cmd); + rc = -EINVAL; + break; + } + + return rc; +} + +/* I'm not sure why you would want to write in to this buffer from + * user space since its principle use is to pass test status info + * back to the user space, but I don't see any reason to prevent it. + */ +static ssize_t +kpios_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int minor = iminor(file->f_dentry->d_inode); + kpios_info_t *info = (kpios_info_t *)file->private_data; + int rc = 0; + + if (minor >= KPIOS_MINORS) + return -ENXIO; + + ASSERT(info); + ASSERT(info->info_buffer); + + spin_lock(&info->info_lock); + + /* Write beyond EOF */ + if (*ppos >= info->info_size) { + rc = -EFBIG; + goto out; + } + + /* Resize count if beyond EOF */ + if (*ppos + count > info->info_size) + count = info->info_size - *ppos; + + if (copy_from_user(info->info_buffer, buf, count)) { + rc = -EFAULT; + goto out; + } + + *ppos += count; + rc = count; +out: + spin_unlock(&info->info_lock); + return rc; +} + +static ssize_t +kpios_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned int minor = iminor(file->f_dentry->d_inode); + kpios_info_t *info = (kpios_info_t *)file->private_data; + int rc = 0; + + if (minor >= KPIOS_MINORS) + return -ENXIO; + + ASSERT(info); + ASSERT(info->info_buffer); + + spin_lock(&info->info_lock); + + /* Read beyond EOF */ + if (*ppos >= info->info_size) + goto out; + + /* Resize count if beyond EOF */ + if (*ppos + count > info->info_size) + count = info->info_size - *ppos; + + if (copy_to_user(buf, info->info_buffer + *ppos, count)) { + rc = -EFAULT; + goto out; + } + + *ppos += count; + rc = count; +out: + spin_unlock(&info->info_lock); + return rc; +} + +static loff_t kpios_seek(struct file *file, loff_t offset, int origin) +{ + unsigned int minor = iminor(file->f_dentry->d_inode); + kpios_info_t *info = (kpios_info_t *)file->private_data; + int rc = -EINVAL; + + if (minor >= KPIOS_MINORS) + return -ENXIO; + + ASSERT(info); + ASSERT(info->info_buffer); + + spin_lock(&info->info_lock); + + switch (origin) { + case 0: /* SEEK_SET - No-op just do it */ + break; + case 1: /* SEEK_CUR - Seek from current */ + offset = file->f_pos + offset; + break; + case 2: /* SEEK_END - Seek from end */ + offset = info->info_size + offset; + break; + } + + if (offset >= 0) { + file->f_pos = offset; + file->f_version = 0; + rc = offset; + } + + spin_unlock(&info->info_lock); + + return rc; +} + +static struct file_operations kpios_fops = { + .owner = THIS_MODULE, + .open = kpios_open, + .release = kpios_release, + .ioctl = kpios_ioctl, + .read = kpios_read, + .write = kpios_write, + .llseek = kpios_seek, +}; + +static struct cdev kpios_cdev = { + .owner = THIS_MODULE, + .kobj = { .name = "kpios", }, +}; + +static int __init +kpios_init(void) +{ + dev_t dev; + int rc; + + dev = MKDEV(KPIOS_MAJOR, 0); + if ((rc = register_chrdev_region(dev, KPIOS_MINORS, "kpios"))) + goto error; + + /* Support for registering a character driver */ + cdev_init(&kpios_cdev, &kpios_fops); + if ((rc = cdev_add(&kpios_cdev, dev, KPIOS_MINORS))) { + printk(KERN_ERR "kpios: Error adding cdev, %d\n", rc); + kobject_put(&kpios_cdev.kobj); + unregister_chrdev_region(dev, KPIOS_MINORS); + goto error; + } + + /* Support for udev make driver info available in sysfs */ + kpios_class = class_create(THIS_MODULE, "kpios"); + if (IS_ERR(kpios_class)) { + rc = PTR_ERR(kpios_class); + printk(KERN_ERR "kpios: Error creating kpios class, %d\n", rc); + cdev_del(&kpios_cdev); + unregister_chrdev_region(dev, KPIOS_MINORS); + goto error; + } + + class_device_create(kpios_class, NULL, dev, NULL, "kpios"); + + printk(KERN_INFO "kpios: Loaded Kernel PIOS Tests v%s\n", VERSION); + return 0; +error: + printk(KERN_ERR "kpios: Error registering kpios device, %d\n", rc); + return rc; +} + +static void +kpios_fini(void) +{ + dev_t dev = MKDEV(KPIOS_MAJOR, 0); + + class_device_destroy(kpios_class, dev); + class_destroy(kpios_class); + + cdev_del(&kpios_cdev); + unregister_chrdev_region(dev, KPIOS_MINORS); + + printk(KERN_INFO "kpios: Unloaded Kernel PIOS Tests\n"); + return; +} + +module_init(kpios_init); +module_exit(kpios_fini); + +MODULE_AUTHOR("LLNL / Sun"); +MODULE_DESCRIPTION("Kernel PIOS implementation"); +MODULE_LICENSE("GPL"); diff --git a/zfs/zcmd/Makefile.in b/zfs/zcmd/Makefile.in index 23720a5c77..276e3f73af 100644 --- a/zfs/zcmd/Makefile.in +++ b/zfs/zcmd/Makefile.in @@ -4,3 +4,4 @@ subdir-m += zpool #subdir-m += ztest #subdir-m += zdump #subdir-m += zinject +subdir-m += zpios diff --git a/zfs/zcmd/zpios/Makefile.in b/zfs/zcmd/zpios/Makefile.in new file mode 100644 index 0000000000..3d4421bb7d --- /dev/null +++ b/zfs/zcmd/zpios/Makefile.in @@ -0,0 +1,13 @@ +DISTFILES = Makefile.in *.c zpios.h + +CMD := zpios + +HOSTCFLAGS += @HOSTCFLAGS@ +HOSTCFLAGS += -I@CMDDIR@/lzpios +HOSTCFLAGS += -I@LIBDIR@/libzpios/include + +hostprogs-y := ${CMD} +always := $(hostprogs-y) + +${CMD}-objs += zpios_main.o +${CMD}-objs += zpios_util.o diff --git a/zfs/zcmd/zpios/zpios.h b/zfs/zcmd/zpios/zpios.h new file mode 100644 index 0000000000..01c2a9b6c6 --- /dev/null +++ b/zfs/zcmd/zpios/zpios.h @@ -0,0 +1,117 @@ +/* + * This file is part of the ZFS Linux port. + * + * Copyright (c) 2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory + * Written by: + * Brian Behlendorf , + * Herb Wartens , + * Jim Garlick + * LLNL-CODE-403049 + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _ZPIOS_H +#define _ZPIOS_H + +#include + +#define VERSION_SIZE 64 + +/* Regular expressions */ +#define REGEX_NUMBERS "^[0-9]*[0-9]$" +#define REGEX_NUMBERS_COMMA "^([0-9]+,)*[0-9]+$" +#define REGEX_SIZE "^[0-9][0-9]*[kmgt]$" +#define REGEX_SIZE_COMMA "^([0-9][0-9]*[kmgt]+,)*[0-9][0-9]*[kmgt]$" + +/* Flags for low, high, incr */ +#define FLAG_SET 0x01 +#define FLAG_LOW 0x02 +#define FLAG_HIGH 0x04 +#define FLAG_INCR 0x08 + +#define TRUE 1 +#define FALSE 0 + +#define KB (1024) +#define MB (KB * 1024) +#define GB (MB * 1024) +#define TB (GB * 1024) + +/* All offsets, sizes and counts can be passed to the application in + * multiple ways. + * 1. a value (stored in val[0], val_count will be 1) + * 2. a comma separated list of values (stored in val[], using val_count) + * 3. a range and block sizes, low, high, factor (val_count must be 0) + */ +typedef struct pios_range_repeat { + uint64_t val[32]; /* Comma sep array, or low, high, inc */ + uint64_t val_count; /* Num of values */ + uint64_t val_low; + uint64_t val_high; + uint64_t val_inc_perc; + uint64_t next_val; /* Used for multiple runs in get_next() */ +} range_repeat_t; + +typedef struct cmd_args { + range_repeat_t T; /* Thread count */ + range_repeat_t N; /* Region count */ + range_repeat_t O; /* Offset count */ + range_repeat_t C; /* Chunksize */ + range_repeat_t S; /* Regionsize */ + + const char *pool; /* Pool */ + uint32_t flags; /* Flags */ + uint32_t io_type; /* DMUIO only */ + uint32_t verbose; /* Verbose */ + uint32_t human_readable; /* Human readable output */ + + uint64_t regionnoise; /* Region noise */ + uint64_t chunknoise; /* Chunk noise */ + uint64_t thread_delay; /* Thread delay */ + + char pre[KPIOS_PATH_SIZE]; /* Pre-exec hook */ + char post[KPIOS_PATH_SIZE]; /* Post-exec hook */ + char log[KPIOS_PATH_SIZE]; /* Requested log dir */ + + /* Control */ + int current_id; + uint64_t current_T; + uint64_t current_N; + uint64_t current_C; + uint64_t current_S; + uint64_t current_O; + + uint32_t rc; +} cmd_args_t; + +int set_count(char *pattern1, char *pattern2, range_repeat_t *range, + char *optarg, uint32_t *flags, char *arg); +int set_lhi(char *pattern, range_repeat_t *range, char *optarg, + int flag, uint32_t *flag_thread, char *arg); +int set_noise(uint64_t *noise, char *optarg, char *arg); +int set_load_params(cmd_args_t *args, char *optarg); +int check_mutual_exclusive_command_lines(uint32_t flag, char *arg); +void print_stats_header(void); +void print_stats(cmd_args_t *args, kpios_cmd_t *cmd); + +#endif /* _ZPIOS_H */ diff --git a/zfs/zcmd/zpios/zpios_main.c b/zfs/zcmd/zpios/zpios_main.c new file mode 100644 index 0000000000..e1e988680a --- /dev/null +++ b/zfs/zcmd/zpios/zpios_main.c @@ -0,0 +1,619 @@ +/* + * This file is part of the ZFS Linux port. + * + * Copyright (c) 2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory + * Written by: + * Brian Behlendorf , + * Herb Wartens , + * Jim Garlick + * LLNL-CODE-403049 + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Kernel PIOS DMU implemenation originally derived from PIOS test code. + * Character control interface derived from SPL code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zpios.h" + +static const char short_opt[] = "t:l:h:e:n:i:j:k:c:u:a:b:g:L:P:R:I:" + "G:T:Vzs:A:B:C:o:m:q:r:fwxdp:v?"; +static const struct option long_opt[] = { + {"chunksize", required_argument, 0, 'c' }, + {"chunksize_low", required_argument, 0, 'a' }, + {"chunksize_high", required_argument, 0, 'b' }, + {"chunksize_incr", required_argument, 0, 'g' }, + {"offset", required_argument, 0, 'o' }, + {"offset_low", required_argument, 0, 'm' }, + {"offset_high", required_argument, 0, 'q' }, + {"offset_incr", required_argument, 0, 'r' }, + {"regioncount", required_argument, 0, 'n' }, + {"regioncount_low", required_argument, 0, 'i' }, + {"regioncount_high", required_argument, 0, 'j' }, + {"regioncount_incr", required_argument, 0, 'k' }, + {"threadcount", required_argument, 0, 't' }, + {"threadcount_low", required_argument, 0, 'l' }, + {"threadcount_high", required_argument, 0, 'h' }, + {"threadcount_incr", required_argument, 0, 'e' }, + {"regionsize", required_argument, 0, 's' }, + {"regionsize_low", required_argument, 0, 'A' }, + {"regionsize_high", required_argument, 0, 'B' }, + {"regionsize_incr", required_argument, 0, 'C' }, + {"cleanup", no_argument, 0, 'x' }, + {"verify", no_argument, 0, 'V' }, + {"zerocopy", no_argument, 0, 'z' }, + {"threaddelay", required_argument, 0, 'T' }, + {"regionnoise", required_argument, 0, 'I' }, + {"chunknoise", required_argument, 0, 'N' }, + {"prerun", required_argument, 0, 'P' }, + {"postrun", required_argument, 0, 'R' }, + {"log", required_argument, 0, 'G' }, + {"path", required_argument, 0, 'p' }, + {"pool", required_argument, 0, 'p' }, + {"load", required_argument, 0, 'L' }, + {"human-readable", no_argument, 0, 'H' }, + {"help", no_argument, 0, '?' }, + {"verbose", no_argument, 0, 'v' }, + { 0, 0, 0, 0 }, +}; + +static int zpiosctl_fd; /* Control file descriptor */ +static char zpios_version[VERSION_SIZE]; /* Kernel version string */ +static char *zpios_buffer = NULL; /* Scratch space area */ +static int zpios_buffer_size = 0; /* Scratch space size */ + +static int +usage(void) +{ + fprintf(stderr, "Usage: zpios\n"); + fprintf(stderr, + " --chunksize -c =values\n" + " --chunksize_low -a =value\n" + " --chunksize_high -b =value\n" + " --chunksize_incr -g =value\n" + " --offset -o =values\n" + " --offset_low -m =value\n" + " --offset_high -q =value\n" + " --offset_incr -r =value\n" + " --regioncount -n =values\n" + " --regioncount_low -i =value\n" + " --regioncount_high -j =value\n" + " --regioncount_incr -k =value\n" + " --threadcount -t =values\n" + " --threadcount_low -l =value\n" + " --threadcount_high -h =value\n" + " --threadcount_incr -e =value\n" + " --regionsize -s =values\n" + " --regionsize_low -A =value\n" + " --regionsize_high -B =value\n" + " --regionsize_incr -C =value\n" + " --cleanup -x\n" + " --verify -V\n" + " --zerocopy -z\n" + " --threaddelay -T =jiffies\n" + " --regionnoise -I =shift\n" + " --chunknoise -N =bytes\n" + " --prerun -P =pre-command\n" + " --postrun -R =post-command\n" + " --log -G =log directory\n" + " --pool | --path -p =pool name\n" + " --load -L =dmuio\n" + " --human-readable -H\n" + " --help -? =this help\n" + " --verbose -v =increase verbosity\n\n"); + + return 0; +} + +static void args_fini(cmd_args_t *args) +{ + assert(args != NULL); + free(args); +} + +static cmd_args_t * +args_init(int argc, char **argv) +{ + cmd_args_t *args; + uint32_t fl_th = 0; + uint32_t fl_rc = 0; + uint32_t fl_of = 0; + uint32_t fl_rs = 0; + uint32_t fl_cs = 0; + int c, rc; + + if (argc == 1) { + usage(); + return (cmd_args_t *)NULL; + } + + /* Configure and populate the args structures */ + args = malloc(sizeof(*args)); + if (args == NULL) + return NULL; + + memset(args, 0, sizeof(*args)); + + while ((c=getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1) { + rc = 0; + + switch (c) { + case 'v': /* --verbose */ + args->verbose++; + break; + case 't': /* --thread count */ + rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, + &args->T, optarg, &fl_th, "threadcount"); + break; + case 'l': /* --threadcount_low */ + rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, + FLAG_LOW, &fl_th, "threadcount_low"); + break; + case 'h': /* --threadcount_high */ + rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, + FLAG_HIGH, &fl_th, "threadcount_high"); + break; + case 'e': /* --threadcount_inc */ + rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, + FLAG_INCR, &fl_th, "threadcount_incr"); + break; + case 'n': /* --regioncount */ + rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, + &args->N, optarg, &fl_rc, "regioncount"); + break; + case 'i': /* --regioncount_low */ + rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, + FLAG_LOW, &fl_rc, "regioncount_low"); + break; + case 'j': /* --regioncount_high */ + rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, + FLAG_HIGH, &fl_rc, "regioncount_high"); + break; + case 'k': /* --regioncount_inc */ + rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, + FLAG_INCR, &fl_rc, "regioncount_incr"); + break; + case 'o': /* --offset */ + rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, + &args->O, optarg, &fl_of, "offset"); + break; + case 'm': /* --offset_low */ + rc = set_lhi(REGEX_SIZE, &args->O, optarg, + FLAG_LOW, &fl_of, "offset_low"); + break; + case 'q': /* --offset_high */ + rc = set_lhi(REGEX_SIZE, &args->O, optarg, + FLAG_HIGH, &fl_of, "offset_high"); + break; + case 'r': /* --offset_inc */ + rc = set_lhi(REGEX_NUMBERS, &args->O, optarg, + FLAG_INCR, &fl_of, "offset_incr"); + break; + case 'c': /* --chunksize */ + rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, + &args->C, optarg, &fl_cs, "chunksize"); + break; + case 'a': /* --chunksize_low */ + rc = set_lhi(REGEX_SIZE, &args->C, optarg, + FLAG_LOW, &fl_cs, "chunksize_low"); + break; + case 'b': /* --chunksize_high */ + rc = set_lhi(REGEX_SIZE, &args->C, optarg, + FLAG_HIGH, &fl_cs, "chunksize_high"); + break; + case 'g': /* --chunksize_inc */ + rc = set_lhi(REGEX_NUMBERS, &args->C, optarg, + FLAG_INCR, &fl_cs, "chunksize_incr"); + break; + case 's': /* --regionsize */ + rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, + &args->S, optarg, &fl_rs, "regionsize"); + break; + case 'A': /* --regionsize_low */ + rc = set_lhi(REGEX_SIZE, &args->S, optarg, + FLAG_LOW, &fl_rs, "regionsize_low"); + break; + case 'B': /* --regionsize_high */ + rc = set_lhi(REGEX_SIZE, &args->S, optarg, + FLAG_HIGH, &fl_rs, "regionsize_high"); + break; + case 'C': /* --regionsize_inc */ + rc = set_lhi(REGEX_NUMBERS, &args->S, optarg, + FLAG_INCR, &fl_rs, "regionsize_incr"); + break; + case 'L': /* --load */ + rc = set_load_params(args, optarg); + break; + case 'p': /* --pool */ + args->pool = optarg; + break; + case 'x': /* --cleanup */ + args->flags |= DMU_REMOVE; + break; + case 'P': /* --prerun */ + strncpy(args->pre, optarg, KPIOS_PATH_SIZE - 1); + break; + case 'R': /* --postrun */ + strncpy(args->post, optarg, KPIOS_PATH_SIZE - 1); + break; + case 'G': /* --log */ + strncpy(args->log, optarg, KPIOS_PATH_SIZE - 1); + break; + case 'I': /* --regionnoise */ + rc = set_noise(&args->regionnoise, optarg, "regionnoise"); + break; + case 'N': /* --chunknoise */ + rc = set_noise(&args->chunknoise, optarg, "chunknoise"); + break; + case 'T': /* --threaddelay */ + rc = set_noise(&args->thread_delay, optarg, "threaddelay"); + break; + case 'V': /* --verify */ + args->flags |= DMU_VERIFY; + break; + case 'z': /* --verify */ + args->flags |= (DMU_WRITE_ZC | DMU_READ_ZC); + break; + case 'H': + args->human_readable = 1; + break; + case '?': + rc = 1; + break; + default: + fprintf(stderr,"Unknown option '%s'\n",argv[optind-1]); + rc = EINVAL; + break; + } + + if (rc) { + usage(); + args_fini(args); + return NULL; + } + } + + check_mutual_exclusive_command_lines(fl_th, "threadcount"); + check_mutual_exclusive_command_lines(fl_rc, "regioncount"); + check_mutual_exclusive_command_lines(fl_of, "offset"); + check_mutual_exclusive_command_lines(fl_rs, "regionsize"); + check_mutual_exclusive_command_lines(fl_cs, "chunksize"); + + if (args->pool == NULL) { + fprintf(stderr, "Error: Pool not specificed\n"); + usage(); + args_fini(args); + return NULL; + } + + if ((args->flags & (DMU_WRITE_ZC | DMU_READ_ZC)) && + (args->flags & DMU_VERIFY)) { + fprintf(stderr, "Error, --zerocopy incompatible --verify, " + "used for performance analysis only\n"); + usage(); + args_fini(args); + return NULL; + } + + return args; +} + +static int +dev_clear(void) +{ + kpios_cfg_t cfg; + int rc; + + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_magic = KPIOS_CFG_MAGIC; + cfg.cfg_cmd = KPIOS_CFG_BUFFER_CLEAR; + cfg.cfg_arg1 = 0; + + rc = ioctl(zpiosctl_fd, KPIOS_CFG, &cfg); + if (rc) + fprintf(stderr, "Ioctl() error %lu / %d: %d\n", + (unsigned long) KPIOS_CFG, cfg.cfg_cmd, errno); + + lseek(zpiosctl_fd, 0, SEEK_SET); + + return rc; +} + +/* Passing a size of zero simply results in querying the current size */ +static int +dev_size(int size) +{ + kpios_cfg_t cfg; + int rc; + + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_magic = KPIOS_CFG_MAGIC; + cfg.cfg_cmd = KPIOS_CFG_BUFFER_SIZE; + cfg.cfg_arg1 = size; + + rc = ioctl(zpiosctl_fd, KPIOS_CFG, &cfg); + if (rc) { + fprintf(stderr, "Ioctl() error %lu / %d: %d\n", + (unsigned long) KPIOS_CFG, cfg.cfg_cmd, errno); + return rc; + } + + return cfg.cfg_rc1; +} + +static void +dev_fini(void) +{ + if (zpios_buffer) + free(zpios_buffer); + + if (zpiosctl_fd != -1) { + if (close(zpiosctl_fd) == -1) { + fprintf(stderr, "Unable to close %s: %d\n", + KPIOS_DEV, errno); + } + } +} + +static int +dev_init(void) +{ + int rc; + + zpiosctl_fd = open(KPIOS_DEV, O_RDONLY); + if (zpiosctl_fd == -1) { + fprintf(stderr, "Unable to open %s: %d\n" + "Is the zpios module loaded?\n", KPIOS_DEV, errno); + rc = errno; + goto error; + } + + if ((rc = dev_clear())) + goto error; + + if ((rc = dev_size(0)) < 0) + goto error; + + zpios_buffer_size = rc; + zpios_buffer = (char *)malloc(zpios_buffer_size); + if (zpios_buffer == NULL) { + rc = ENOMEM; + goto error; + } + + memset(zpios_buffer, 0, zpios_buffer_size); + return 0; +error: + if (zpiosctl_fd != -1) { + if (close(zpiosctl_fd) == -1) { + fprintf(stderr, "Unable to close %s: %d\n", + KPIOS_DEV, errno); + } + } + + return rc; +} + +static int +get_next(uint64_t *val, range_repeat_t *range) +{ + int i; + + /* if low, incr, high is given */ + if (range->val_count == 0) { + *val = (range->val_low) + + (range->val_low * range->next_val / 100); + + if (*val > range->val_high) + return 0; /* No more values, limit exceeded */ + + if (!range->next_val) + range->next_val = range->val_inc_perc; + else + range->next_val = range->next_val+range->val_inc_perc; + + return 1; /* more values to come */ + + /* if only one val is given */ + } else if (range->val_count == 1) { + if (range->next_val) + return 0; /* No more values, we only have one */ + + *val = range->val[0]; + range->next_val = 1; + return 1; /* more values to come */ + + /* if comma separated values are given */ + } else if (range->val_count > 1) { + if (range->next_val > range->val_count - 1) + return 0; /* No more values, limit exceeded */ + + *val = range->val[range->next_val]; + range->next_val++; + return 1; /* more values to come */ + } + + return 0; +} + +static int +run_one(cmd_args_t *args, uint32_t id, uint32_t T, uint32_t N, + uint64_t C, uint64_t S, uint64_t O) +{ + kpios_cmd_t *cmd; + int rc, rc2, cmd_size; + + dev_clear(); + + cmd_size = sizeof(kpios_cmd_t) + ((T + N + 1) * sizeof(kpios_stats_t)); + cmd = (kpios_cmd_t *)malloc(cmd_size); + if (cmd == NULL) + return ENOMEM; + + memset(cmd, 0, cmd_size); + cmd->cmd_magic = KPIOS_CMD_MAGIC; + strncpy(cmd->cmd_pool, args->pool, KPIOS_NAME_SIZE - 1); + strncpy(cmd->cmd_pre, args->pre, KPIOS_PATH_SIZE - 1); + strncpy(cmd->cmd_post, args->post, KPIOS_PATH_SIZE - 1); + strncpy(cmd->cmd_log, args->log, KPIOS_PATH_SIZE - 1); + cmd->cmd_id = id; + cmd->cmd_chunk_size = C; + cmd->cmd_thread_count = T; + cmd->cmd_region_count = N; + cmd->cmd_region_size = S; + cmd->cmd_offset = O; + cmd->cmd_region_noise = args->regionnoise; + cmd->cmd_chunk_noise = args->chunknoise; + cmd->cmd_thread_delay = args->thread_delay; + cmd->cmd_flags = args->flags; + cmd->cmd_data_size = (T + N + 1) * sizeof(kpios_stats_t); + + rc = ioctl(zpiosctl_fd, KPIOS_CMD, cmd); + if (rc) + args->rc = errno; + + print_stats(args, cmd); + + if (args->verbose) { + rc2 = read(zpiosctl_fd, zpios_buffer, zpios_buffer_size - 1); + if (rc2 < 0) { + fprintf(stdout, "Error reading results: %d\n", rc2); + } else if ((rc2 > 0) && (strlen(zpios_buffer) > 0)) { + fprintf(stdout, "\n%s\n", zpios_buffer); + fflush(stdout); + } + } + + free(cmd); + + return rc; +} + +static int +run_offsets(cmd_args_t *args) +{ + int rc = 0; + + while (rc == 0 && get_next(&args->current_O, &args->O)) { + rc = run_one(args, args->current_id, + args->current_T, args->current_N, args->current_C, + args->current_S, args->current_O); + args->current_id++; + } + + args->O.next_val = 0; + return rc; +} + +static int +run_region_counts(cmd_args_t *args) +{ + int rc = 0; + + while (rc == 0 && get_next((uint64_t *)&args->current_N, &args->N)) + rc = run_offsets(args); + + args->N.next_val = 0; + return rc; +} + +static int +run_region_sizes(cmd_args_t *args) +{ + int rc = 0; + + while (rc == 0 && get_next(&args->current_S, &args->S)) { + if (args->current_S < args->current_C) { + fprintf(stderr, "Error: in any run chunksize can " + "not be smaller than regionsize.\n"); + return EINVAL; + } + + rc = run_region_counts(args); + } + + args->S.next_val = 0; + return rc; +} + +static int +run_chunk_sizes(cmd_args_t *args) +{ + int rc = 0; + + while (rc == 0 && get_next(&args->current_C, &args->C)) { + rc = run_region_sizes(args); + } + + args->C.next_val = 0; + return rc; +} + +static int +run_thread_counts(cmd_args_t *args) +{ + int rc = 0; + + while (rc == 0 && get_next((uint64_t *)&args->current_T, &args->T)) + rc = run_chunk_sizes(args); + + return rc; +} + +int +main(int argc, char **argv) +{ + cmd_args_t *args; + int rc = 0; + + /* Argument init and parsing */ + if ((args = args_init(argc, argv)) == NULL) { + rc = -1; + goto out; + } + + /* Device specific init */ + if ((rc = dev_init())) + goto out; + + /* Generic kernel version string */ + if (args->verbose) + fprintf(stdout, "%s", zpios_version); + + print_stats_header(); + rc = run_thread_counts(args); +out: + if (args != NULL) + args_fini(args); + + dev_fini(); + return rc; +} diff --git a/zfs/zcmd/zpios/zpios_util.c b/zfs/zcmd/zpios/zpios_util.c new file mode 100644 index 0000000000..4d7e52d648 --- /dev/null +++ b/zfs/zcmd/zpios/zpios_util.c @@ -0,0 +1,440 @@ +/* + * This file is part of the ZFS Linux port. + * + * Copyright (c) 2008 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory + * Written by: + * Brian Behlendorf , + * Herb Wartens , + * Jim Garlick + * LLNL-CODE-403049 + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Kernel PIOS DMU implemenation originally derived from PIOS test code. + * Character control interface derived from SPL code. + */ + +#include +#include +#include +#include +#include +#include +#include "zpios.h" + +/* extracts an unsigned int (64) and K,M,G,T from the string */ +/* and returns a 64 bit value converted to the proper units */ +static int +kmgt_to_uint64(const char *str, uint64_t *val) +{ + char *endptr; + int rc = 0; + + *val = strtoll(str, &endptr, 0); + if ((str == endptr) && (*val == 0)) + return EINVAL; + + switch (endptr[0]) { + case 'k': case 'K': + *val = (*val) << 10; + break; + case 'm': case 'M': + *val = (*val) << 20; + break; + case 'g': case 'G': + *val = (*val) << 30; + break; + case 't': case 'T': + *val = (*val) << 40; + break; + case '\0': + break; + default: + rc = EINVAL; + } + + return rc; +} + +static char * +uint64_to_kmgt(char *str, uint64_t val) +{ + char postfix[] = "kmgt"; + int i = -1; + + while ((val >= KB) && (i < 4)) { + val = (val >> 10); + i++; + } + + if (i >= 4) + sprintf(str, "inf"); + else + sprintf(str, "%lu%c", (unsigned long)val, + (i == -1) ? '\0' : postfix[i]); + + return str; +} + +static char * +kmgt_per_sec(char *str, uint64_t v, double t) +{ + char postfix[] = "kmgt"; + double val = ((double)v) / t; + int i = -1; + + while ((val >= (double)KB) && (i < 4)) { + val /= (double)KB; + i++; + } + + if (i >= 4) + sprintf(str, "inf"); + else + sprintf(str, "%.2f%c", val, + (i == -1) ? '\0' : postfix[i]); + + return str; +} + +static char * +print_flags(char *str, uint32_t flags) +{ + str[0] = (flags & DMU_WRITE) ? 'w' : '-'; + str[1] = (flags & DMU_READ) ? 'r' : '-'; + str[2] = (flags & DMU_VERIFY) ? 'v' : '-'; + str[3] = (flags & DMU_REMOVE) ? 'c' : '-'; + str[4] = (flags & DMU_FPP) ? 'p' : 's'; + str[5] = (flags & (DMU_WRITE_ZC | DMU_READ_ZC)) ? 'z' : '-'; + str[6] = '\0'; + + return str; +} + +static double +timespec_to_double(struct timespec t) +{ + return ((double)(t.tv_sec) + + ((double)(t.tv_nsec) / (double)(1000*1000*1000))); +} + +static int +regex_match(const char *string, char *pattern) +{ + regex_t re; + int rc; + + rc = regcomp(&re, pattern, REG_EXTENDED | REG_NOSUB | REG_ICASE); + if (rc) { + fprintf(stderr, "Error: Couldn't do regcomp, %d\n", rc); + return rc; + } + + rc = regexec(&re, string, (size_t) 0, NULL, 0); + regfree(&re); + + return rc; +} + +/* fills the pios_range_repeat structure of comma separated values */ +static int +split_string(const char *optarg, char *pattern, range_repeat_t *range) +{ + const char comma[] = ","; + char *cp, *token[32]; + int rc, i = 0; + + if ((rc = regex_match(optarg, pattern))) + return rc; + + cp = strdup(optarg); + if (cp == NULL) + return ENOMEM; + + do { + /* STRTOK(3) Each subsequent call, with a null pointer as the + * value of the * first argument, starts searching from the + * saved pointer and behaves as described above. + */ + token[i] = strtok(cp, comma); + cp = NULL; + } while ((token[i++] != NULL) && (i < 32)); + + range->val_count = i - 1; + + for (i = 0; i < range->val_count; i++) + kmgt_to_uint64(token[i], &range->val[i]); + + free(cp); + return 0; +} + +int +set_count(char *pattern1, char *pattern2, range_repeat_t *range, + char *optarg, uint32_t *flags, char *arg) +{ + if (flags) + *flags |= FLAG_SET; + + range->next_val = 0; + + if (regex_match(optarg, pattern1) == 0) { + kmgt_to_uint64(optarg, &range->val[0]); + range->val_count = 1; + } else if (split_string(optarg, pattern2, range) < 0) { + fprintf(stderr, "Error: Incorrect pattern for %s, '%s'\n", + arg, optarg); + return EINVAL; + } + + return 0; +} + +/* validates the value with regular expression and sets low, high, incr + * according to value at which flag will be set. Sets the flag after. */ +int +set_lhi(char *pattern, range_repeat_t *range, char *optarg, + int flag, uint32_t *flag_thread, char *arg) +{ + int rc; + + if ((rc = regex_match(optarg, pattern))) { + fprintf(stderr, "Error: Wrong pattern in %s, '%s'\n", + arg, optarg); + return rc; + } + + switch (flag) { + case FLAG_LOW: + kmgt_to_uint64(optarg, &range->val_low); + break; + case FLAG_HIGH: + kmgt_to_uint64(optarg, &range->val_high); + break; + case FLAG_INCR: + kmgt_to_uint64(optarg, &range->val_inc_perc); + break; + default: + assert(0); + } + + *flag_thread |= flag; + + return 0; +} + +int +set_noise(uint64_t *noise, char *optarg, char *arg) +{ + if (regex_match(optarg, REGEX_NUMBERS) == 0) { + kmgt_to_uint64(optarg, noise); + } else { + fprintf(stderr, "Error: Incorrect pattern for %s\n", arg); + return EINVAL; + } + + return 0; +} + +int +set_load_params(cmd_args_t *args, char *optarg) +{ + char *param, *search, comma[] = ","; + int rc = 0; + + search = strdup(optarg); + if (search == NULL) + return ENOMEM; + + while ((param = strtok(search, comma)) != NULL) { + search = NULL; + + if (strcmp("fpp", param) == 0) { + args->flags |= DMU_FPP; /* File Per Process/Thread */ + } else if (strcmp("sff", param) == 0) { + args->flags &= ~DMU_FPP; /* Shared Shared File */ + } else if (strcmp("dmuio", param) == 0) { + args->io_type |= DMU_IO; + args->flags |= (DMU_WRITE | DMU_READ); + } else { + fprintf(stderr, "Invalid load: %s\n", param); + rc = EINVAL; + } + } + + free(search); + + return rc; +} + + +/* checks the low, high, increment values against the single value for + * mutual exclusion, for e.g threadcount is mutually exclusive to + * threadcount_low, ..._high, ..._incr */ +int +check_mutual_exclusive_command_lines(uint32_t flag, char *arg) +{ + if ((flag & FLAG_SET) && (flag & (FLAG_LOW | FLAG_HIGH | FLAG_INCR))) { + fprintf(stderr, "Error: --%s can not be given with --%s_low, " + "--%s_high or --%s_incr.\n", arg, arg, arg, arg); + return 0; + } + + if ((flag & (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) && !(flag & FLAG_SET)) { + if (flag != (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) { + fprintf(stderr, "Error: One or more values missing " + "from --%s_low, --%s_high, --%s_incr.\n", + arg, arg, arg); + return 0; + } + } + + return 1; +} + +void +print_stats_header(void) +{ + printf("ret-code id\tth-cnt\trg-cnt\trg-sz\tch-sz\toffset\trg-no\tch-no\t" + "th-dly\tflags\ttime\tcr-time\trm-time\twr-time\t" + "rd-time\twr-data\twr-ch\twr-bw\trd-data\trd-ch\trd-bw\n"); + printf("------------------------------------------------------------" + "------------------------------------------------------------" + "-----------------------------------------------------------\n"); +} + +static void +print_stats_human_readable(cmd_args_t *args, kpios_cmd_t *cmd) +{ + kpios_stats_t *summary_stats; + double t_time, wr_time, rd_time, cr_time, rm_time; + char str[16]; + + if (args->rc) + printf("FAILED: %3d ", args->rc); + else + printf("PASSED: %3d ", args->rc); + + printf("%u\t", cmd->cmd_id); + printf("%u\t", cmd->cmd_thread_count); + printf("%u\t", cmd->cmd_region_count); + printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_size)); + printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_size)); + printf("%s\t", uint64_to_kmgt(str, cmd->cmd_offset)); + printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_noise)); + printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_noise)); + printf("%s\t", uint64_to_kmgt(str, cmd->cmd_thread_delay)); + printf("%s\t", print_flags(str, cmd->cmd_flags)); + + if (args->rc) { + printf("\n"); + return; + } + + summary_stats = (kpios_stats_t *)cmd->cmd_data_str; + t_time = timespec_to_double(summary_stats->total_time.delta); + wr_time = timespec_to_double(summary_stats->wr_time.delta); + rd_time = timespec_to_double(summary_stats->rd_time.delta); + cr_time = timespec_to_double(summary_stats->cr_time.delta); + rm_time = timespec_to_double(summary_stats->rm_time.delta); + + printf("%.2f\t", t_time); + printf("%.3f\t", cr_time); + printf("%.3f\t", rm_time); + printf("%.2f\t", wr_time); + printf("%.2f\t", rd_time); + + printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_data)); + printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_chunks)); + printf("%s\t", kmgt_per_sec(str, summary_stats->wr_data, wr_time)); + + printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_data)); + printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_chunks)); + printf("%s\n", kmgt_per_sec(str, summary_stats->rd_data, rd_time)); + fflush(stdout); +} + +static void +print_stats_table(cmd_args_t *args, kpios_cmd_t *cmd) +{ + kpios_stats_t *summary_stats; + double wr_time, rd_time; + + if (args->rc) + printf("FAILED: %3d ", args->rc); + else + printf("PASSED: %3d ", args->rc); + + printf("%u\t", cmd->cmd_id); + printf("%u\t", cmd->cmd_thread_count); + printf("%u\t", cmd->cmd_region_count); + printf("%llu\t", (long long unsigned)cmd->cmd_region_size); + printf("%llu\t", (long long unsigned)cmd->cmd_chunk_size); + printf("%llu\t", (long long unsigned)cmd->cmd_offset); + printf("%u\t", cmd->cmd_region_noise); + printf("%u\t", cmd->cmd_chunk_noise); + printf("%u\t", cmd->cmd_thread_delay); + printf("0x%x\t", cmd->cmd_flags); + + if (args->rc) { + printf("\n"); + return; + } + + summary_stats = (kpios_stats_t *)cmd->cmd_data_str; + wr_time = timespec_to_double(summary_stats->wr_time.delta); + rd_time = timespec_to_double(summary_stats->rd_time.delta); + + printf("%ld.%02ld\t", + summary_stats->total_time.delta.tv_sec, + summary_stats->total_time.delta.tv_nsec); + printf("%ld.%02ld\t", + summary_stats->cr_time.delta.tv_sec, + summary_stats->cr_time.delta.tv_nsec); + printf("%ld.%02ld\t", + summary_stats->rm_time.delta.tv_sec, + summary_stats->rm_time.delta.tv_nsec); + printf("%ld.%02ld\t", + summary_stats->wr_time.delta.tv_sec, + summary_stats->wr_time.delta.tv_nsec); + printf("%ld.%02ld\t", + summary_stats->rd_time.delta.tv_sec, + summary_stats->rd_time.delta.tv_nsec); + + printf("%lld\t", (long long unsigned)summary_stats->wr_data); + printf("%lld\t", (long long unsigned)summary_stats->wr_chunks); + printf("%.4f\t", (double)summary_stats->wr_data / wr_time); + + printf("%lld\t", (long long unsigned)summary_stats->rd_data); + printf("%lld\t", (long long unsigned)summary_stats->rd_chunks); + printf("%.4f\n", (double)summary_stats->rd_data / rd_time); + fflush(stdout); +} + +void +print_stats(cmd_args_t *args, kpios_cmd_t *cmd) +{ + if (args->human_readable) + print_stats_human_readable(args, cmd); + else + print_stats_table(args, cmd); +}