/* * This file is part of the ZFS Linux port. * * Copyright (c) 2008 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory * Written by: * Brian Behlendorf , * Herb Wartens , * Jim Garlick * LLNL-CODE-403049 * * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * Kernel PIOS DMU implemenation originally derived from PIOS test code. * Character control interface derived from SPL code. */ #include #include #include #include #include "zpios-internal.h" static struct class *kpios_class; static inline struct timespec timespec_add(struct timespec lhs, struct timespec rhs) { struct timespec ts_delta; set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } static int kpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) { /* This is stack heavy but it should be OK since we are only * making the upcall between tests when the stack is shallow. */ char id[16], chunk_size[16], region_size[16], thread_count[16]; char region_count[16], offset[16], region_noise[16], chunk_noise[16]; char thread_delay[16], flags[16], result[8]; char *argv[16], *envp[4]; if (strlen(path) == 0) return -ENOENT; snprintf(id, 15, "%d", run_args->id); snprintf(chunk_size, 15, "%lu", (long unsigned)run_args->chunk_size); snprintf(region_size, 15, "%lu",(long unsigned) run_args->region_size); snprintf(thread_count, 15, "%u", run_args->thread_count); snprintf(region_count, 15, "%u", run_args->region_count); snprintf(offset, 15, "%lu", (long unsigned)run_args->offset); snprintf(region_noise, 15, "%u", run_args->region_noise); snprintf(chunk_noise, 15, "%u", run_args->chunk_noise); snprintf(thread_delay, 15, "%u", run_args->thread_delay); snprintf(flags, 15, "0x%x", run_args->flags); snprintf(result, 7, "%d", rc); /* Passing 15 args to registered pre/post upcall */ argv[0] = path; argv[1] = phase; argv[2] = strlen(run_args->log) ? run_args->log : ""; argv[3] = id; argv[4] = run_args->pool; argv[5] = chunk_size; argv[6] = region_size; argv[7] = thread_count; argv[8] = region_count; argv[9] = offset; argv[10] = region_noise; argv[11] = chunk_noise; argv[12] = thread_delay; argv[13] = flags; argv[14] = result; argv[15] = NULL; /* Passing environment for userspace upcall */ envp[0] = "HOME=/"; envp[1] = "TERM=linux"; envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; envp[3] = NULL; return call_usermodehelper(path, argv, envp, 1); } static uint64_t kpios_dmu_object_create(run_args_t *run_args, objset_t *os) { struct dmu_tx *tx; uint64_t obj = 0ULL; int rc; tx = dmu_tx_create(os); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, OBJ_SIZE); rc = dmu_tx_assign(tx, TXG_WAIT); if (rc) { kpios_print(run_args->file, "dmu_tx_assign() failed: %d\n", rc); dmu_tx_abort(tx); return obj; } obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, DMU_OT_NONE, 0, tx); rc = dmu_object_set_blocksize(os, obj, 128ULL << 10, 0, tx); if (rc) { kpios_print(run_args->file, "dmu_object_set_blocksize() failed: %d\n", rc); dmu_tx_abort(tx); return obj; } dmu_tx_commit(tx); return obj; } static int kpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) { struct dmu_tx *tx; int rc; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); rc = dmu_tx_assign(tx, TXG_WAIT); if (rc) { kpios_print(run_args->file, "dmu_tx_assign() failed: %d\n", rc); dmu_tx_abort(tx); return rc; } rc = dmu_object_free(os, obj, tx); if (rc) { kpios_print(run_args->file, "dmu_object_free() failed: %d\n", rc); dmu_tx_abort(tx); return rc; } dmu_tx_commit(tx); return 0; } static int kpios_dmu_setup(run_args_t *run_args) { kpios_time_t *t = &(run_args->stats.cr_time); objset_t *os; uint64_t obj = 0ULL; int i, rc = 0; t->start = current_kernel_time(); rc = dmu_objset_open(run_args->pool, DMU_OST_ZFS, DS_MODE_STANDARD, &os); if (rc) { kpios_print(run_args->file, "Error dmu_objset_open() " "failed: %d\n", rc); goto out; } if (!(run_args->flags & DMU_FPP)) { obj = kpios_dmu_object_create(run_args, os); if (obj == 0) { rc = -EBADF; kpios_print(run_args->file, "Error kpios_dmu_" "object_create() failed, %d\n", rc); goto out; } } for (i = 0; i < run_args->region_count; i++) { kpios_region_t *region; region = &run_args->regions[i]; mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); if (run_args->flags & DMU_FPP) { /* File per process */ region->obj.os = os; region->obj.obj = kpios_dmu_object_create(run_args, os); ASSERT(region->obj.obj > 0); /* XXX - Handle this */ region->wr_offset = run_args->offset; region->rd_offset = run_args->offset; region->init_offset = run_args->offset; region->max_offset = run_args->offset + run_args->region_size; } else { /* Single shared file */ region->obj.os = os; region->obj.obj = obj; region->wr_offset = run_args->offset * i; region->rd_offset = run_args->offset * i; region->init_offset = run_args->offset * i; region->max_offset = run_args->offset * i + run_args->region_size; } } run_args->os = os; out: t->stop = current_kernel_time(); t->delta = timespec_sub(t->stop, t->start); return rc; } static int kpios_setup_run(run_args_t **run_args, kpios_cmd_t *kcmd, struct file *file) { run_args_t *ra; int rc, size; size = sizeof(*ra) + kcmd->cmd_region_count * sizeof(kpios_region_t); ra = vmem_zalloc(size, KM_SLEEP); if (ra == NULL) { kpios_print(file, "Unable to vmem_zalloc() %d bytes " "for regions\n", size); return -ENOMEM; } *run_args = ra; strncpy(ra->pool, kcmd->cmd_pool, KPIOS_NAME_SIZE - 1); strncpy(ra->pre, kcmd->cmd_pre, KPIOS_PATH_SIZE - 1); strncpy(ra->post, kcmd->cmd_post, KPIOS_PATH_SIZE - 1); strncpy(ra->log, kcmd->cmd_log, KPIOS_PATH_SIZE - 1); ra->id = kcmd->cmd_id; ra->chunk_size = kcmd->cmd_chunk_size; ra->thread_count = kcmd->cmd_thread_count; ra->region_count = kcmd->cmd_region_count; ra->region_size = kcmd->cmd_region_size; ra->offset = kcmd->cmd_offset; ra->region_noise = kcmd->cmd_region_noise; ra->chunk_noise = kcmd->cmd_chunk_noise; ra->thread_delay = kcmd->cmd_thread_delay; ra->flags = kcmd->cmd_flags; ra->stats.wr_data = 0; ra->stats.wr_chunks = 0; ra->stats.rd_data = 0; ra->stats.rd_chunks = 0; ra->region_next = 0; ra->file = file; mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); rc = kpios_dmu_setup(ra); if (rc) { mutex_destroy(&ra->lock_ctl); mutex_destroy(&ra->lock_work); vmem_free(ra, size); *run_args = NULL; } return rc; } static int kpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, __u32 *chunk_size, kpios_region_t **region, __u32 flags) { int i, j, count = 0; unsigned int random_int; get_random_bytes(&random_int, sizeof(unsigned int)); mutex_enter(&run_args->lock_work); i = run_args->region_next; /* XXX: I don't much care for this chunk selection mechansim * there's the potential to burn a lot of time here doing nothing * useful while holding the global lock. This could give some * misleading performance results. I'll fix it latter. */ while (count < run_args->region_count) { __u64 *rw_offset; kpios_time_t *rw_time; j = i % run_args->region_count; *region = &(run_args->regions[j]); if (flags & DMU_WRITE) { rw_offset = &((*region)->wr_offset); rw_time = &((*region)->stats.wr_time); } else { rw_offset = &((*region)->rd_offset); rw_time = &((*region)->stats.rd_time); } /* test if region is fully written */ if (*rw_offset + *chunk_size > (*region)->max_offset) { i++; count++; if (unlikely(rw_time->stop.tv_sec == 0) && unlikely(rw_time->stop.tv_nsec == 0)) rw_time->stop = current_kernel_time(); continue; } *offset = *rw_offset; *obj = (*region)->obj; *rw_offset += *chunk_size; /* update ctl structure */ if (run_args->region_noise) { get_random_bytes(&random_int, sizeof(unsigned int)); run_args->region_next += random_int % run_args->region_noise; } else { run_args->region_next++; } mutex_exit(&run_args->lock_work); return 1; } /* nothing left to do */ mutex_exit(&run_args->lock_work); return 0; } static void kpios_remove_objects(run_args_t *run_args) { kpios_time_t *t = &(run_args->stats.rm_time); kpios_region_t *region; int rc = 0, i; t->start = current_kernel_time(); if (run_args->flags & DMU_REMOVE) { if (run_args->flags & DMU_FPP) { for (i = 0; i < run_args->region_count; i++) { region = &run_args->regions[i]; rc = kpios_dmu_object_free(run_args, region->obj.os, region->obj.obj); if (rc) kpios_print(run_args->file, "Error " "removing object %d, %d\n", (int)region->obj.obj, rc); } } else { region = &run_args->regions[0]; rc = kpios_dmu_object_free(run_args, region->obj.os, region->obj.obj); if (rc) kpios_print(run_args->file, "Error " "removing object %d, %d\n", (int)region->obj.obj, rc); } } dmu_objset_close(run_args->os); t->stop = current_kernel_time(); t->delta = timespec_sub(t->stop, t->start); } static void kpios_cleanup_run(run_args_t *run_args) { int i, size = 0; if (run_args == NULL) return; if (run_args->threads != NULL) { for (i = 0; i < run_args->thread_count; i++) { if (run_args->threads[i]) { mutex_destroy(&run_args->threads[i]->lock); kmem_free(run_args->threads[i], sizeof(thread_data_t)); } } kmem_free(run_args->threads, sizeof(thread_data_t *) * run_args->thread_count); } if (run_args->regions != NULL) for (i = 0; i < run_args->region_count; i++) mutex_destroy(&run_args->regions[i].lock); mutex_destroy(&run_args->lock_work); mutex_destroy(&run_args->lock_ctl); if (run_args->regions != NULL) size = run_args->region_count * sizeof(kpios_region_t); vmem_free(run_args, sizeof(*run_args) + size); } static int kpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf) { struct dmu_tx *tx; int rc, how = TXG_WAIT; int flags = 0; while (1) { tx = dmu_tx_create(os); dmu_tx_hold_write(tx, object, offset, size); rc = dmu_tx_assign(tx, how); if (rc) { if (rc == ERESTART && how == TXG_NOWAIT) { dmu_tx_wait(tx); dmu_tx_abort(tx); continue; } kpios_print(run_args->file, "Error in dmu_tx_assign(), %d", rc); dmu_tx_abort(tx); return rc; } break; } if (run_args->flags & DMU_WRITE_ZC) flags |= DMU_WRITE_ZEROCOPY; dmu_write_impl(os, object, offset, size, buf, tx, flags); dmu_tx_commit(tx); return 0; } static int kpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf) { int flags = 0; if (run_args->flags & DMU_READ_ZC) flags |= DMU_READ_ZEROCOPY; return dmu_read_impl(os, object, offset, size, buf, flags); } static int kpios_thread_main(void *data) { thread_data_t *thr = (thread_data_t *)data; run_args_t *run_args = thr->run_args; kpios_time_t t; dmu_obj_t obj; __u64 offset; __u32 chunk_size; kpios_region_t *region; char *buf; unsigned int random_int; int chunk_noise = run_args->chunk_noise; int chunk_noise_tmp = 0; int thread_delay = run_args->thread_delay; int thread_delay_tmp = 0; int i, rc = 0; if (chunk_noise) { get_random_bytes(&random_int, sizeof(unsigned int)); chunk_noise_tmp = (random_int % (chunk_noise * 2)) - chunk_noise; } /* It's OK to vmem_alloc() this memory because it will be copied * in to the slab and pointers to the slab copy will be setup in * the bio when the IO is submitted. This of course is not ideal * since we want a zero-copy IO path if possible. It would be nice * to have direct access to those slab entries. */ chunk_size = run_args->chunk_size + chunk_noise_tmp; buf = (char *)vmem_alloc(chunk_size, KM_SLEEP); ASSERT(buf); /* Trivial data verification pattern for now. */ if (run_args->flags & DMU_VERIFY) memset(buf, 'z', chunk_size); /* Write phase */ mutex_enter(&thr->lock); thr->stats.wr_time.start = current_kernel_time(); mutex_exit(&thr->lock); while (kpios_get_work_item(run_args, &obj, &offset, &chunk_size, ®ion, DMU_WRITE)) { if (thread_delay) { get_random_bytes(&random_int, sizeof(unsigned int)); thread_delay_tmp = random_int % thread_delay; set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(thread_delay_tmp); /* In jiffies */ } t.start = current_kernel_time(); rc = kpios_dmu_write(run_args, obj.os, obj.obj, offset, chunk_size, buf); t.stop = current_kernel_time(); t.delta = timespec_sub(t.stop, t.start); if (rc) { kpios_print(run_args->file, "IO error while doing " "dmu_write(): %d\n", rc); break; } mutex_enter(&thr->lock); thr->stats.wr_data += chunk_size; thr->stats.wr_chunks++; thr->stats.wr_time.delta = timespec_add( thr->stats.wr_time.delta, t.delta); mutex_exit(&thr->lock); mutex_enter(®ion->lock); region->stats.wr_data += chunk_size; region->stats.wr_chunks++; region->stats.wr_time.delta = timespec_add( region->stats.wr_time.delta, t.delta); /* First time region was accessed */ if (region->init_offset == offset) region->stats.wr_time.start = t.start; mutex_exit(®ion->lock); } mutex_enter(&run_args->lock_ctl); run_args->threads_done++; mutex_exit(&run_args->lock_ctl); mutex_enter(&thr->lock); thr->rc = rc; thr->stats.wr_time.stop = current_kernel_time(); mutex_exit(&thr->lock); wake_up(&run_args->waitq); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); /* Check if we should exit */ mutex_enter(&thr->lock); rc = thr->rc; mutex_exit(&thr->lock); if (rc) goto out; /* Read phase */ mutex_enter(&thr->lock); thr->stats.rd_time.start = current_kernel_time(); mutex_exit(&thr->lock); while (kpios_get_work_item(run_args, &obj, &offset, &chunk_size, ®ion, DMU_READ)) { if (thread_delay) { get_random_bytes(&random_int, sizeof(unsigned int)); thread_delay_tmp = random_int % thread_delay; set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(thread_delay_tmp); /* In jiffies */ } if (run_args->flags & DMU_VERIFY) memset(buf, 0, chunk_size); t.start = current_kernel_time(); rc = kpios_dmu_read(run_args, obj.os, obj.obj, offset, chunk_size, buf); t.stop = current_kernel_time(); t.delta = timespec_sub(t.stop, t.start); if (rc) { kpios_print(run_args->file, "IO error while doing " "dmu_read(): %d\n", rc); break; } /* Trivial data verification, expensive! */ if (run_args->flags & DMU_VERIFY) { for (i = 0; i < chunk_size; i++) { if (buf[i] != 'z') { kpios_print(run_args->file, "IO verify error: %d/%d/%d\n", (int)obj.obj, (int)offset, (int)chunk_size); break; } } } mutex_enter(&thr->lock); thr->stats.rd_data += chunk_size; thr->stats.rd_chunks++; thr->stats.rd_time.delta = timespec_add( thr->stats.rd_time.delta, t.delta); mutex_exit(&thr->lock); mutex_enter(®ion->lock); region->stats.rd_data += chunk_size; region->stats.rd_chunks++; region->stats.rd_time.delta = timespec_add( region->stats.rd_time.delta, t.delta); /* First time region was accessed */ if (region->init_offset == offset) region->stats.rd_time.start = t.start; mutex_exit(®ion->lock); } mutex_enter(&run_args->lock_ctl); run_args->threads_done++; mutex_exit(&run_args->lock_ctl); mutex_enter(&thr->lock); thr->rc = rc; thr->stats.rd_time.stop = current_kernel_time(); mutex_exit(&thr->lock); wake_up(&run_args->waitq); out: vmem_free(buf, chunk_size); do_exit(0); return rc; /* Unreachable, due to do_exit() */ } static int kpios_thread_done(run_args_t *run_args) { ASSERT(run_args->threads_done <= run_args->thread_count); return (run_args->threads_done == run_args->thread_count); } static int kpios_threads_run(run_args_t *run_args) { struct task_struct *tsk, **tsks; thread_data_t *thr = NULL; kpios_time_t *tt = &(run_args->stats.total_time); kpios_time_t *tw = &(run_args->stats.wr_time); kpios_time_t *tr = &(run_args->stats.rd_time); int i, rc = 0, tc = run_args->thread_count; DEFINE_WAIT(wait); kpios_upcall(run_args->pre, PHASE_PRE, run_args, 0); tsks = kmem_zalloc(sizeof(struct task_struct *) * tc, KM_SLEEP); if (tsks == NULL) { rc = -ENOMEM; goto cleanup2; } run_args->threads = kmem_zalloc(sizeof(thread_data_t *) * tc, KM_SLEEP); if (run_args->threads == NULL) { rc = -ENOMEM; goto cleanup; } init_waitqueue_head(&run_args->waitq); run_args->threads_done = 0; /* Create all the needed threads which will sleep until awoken */ for (i = 0; i < tc; i++) { thr = kmem_zalloc(sizeof(thread_data_t), KM_SLEEP); if (thr == NULL) { rc = -ENOMEM; goto taskerr; } thr->thread_no = i; thr->run_args = run_args; thr->rc = 0; mutex_init(&thr->lock, NULL, MUTEX_DEFAULT, NULL); run_args->threads[i] = thr; tsk = kthread_create(kpios_thread_main, (void *)thr, "%s/%d", "kpios_io", i); if (IS_ERR(tsk)) { rc = -EINVAL; goto taskerr; } tsks[i] = tsk; } tt->start = current_kernel_time(); /* Wake up all threads for write phase */ kpios_upcall(run_args->pre, PHASE_WRITE, run_args, 0); for (i = 0; i < tc; i++) wake_up_process(tsks[i]); /* Wait for write phase to complete */ tw->start = current_kernel_time(); wait_event(run_args->waitq, kpios_thread_done(run_args)); tw->stop = current_kernel_time(); for (i = 0; i < tc; i++) { thr = run_args->threads[i]; mutex_enter(&thr->lock); if (!rc && thr->rc) rc = thr->rc; run_args->stats.wr_data += thr->stats.wr_data; run_args->stats.wr_chunks += thr->stats.wr_chunks; mutex_exit(&thr->lock); } kpios_upcall(run_args->post, PHASE_WRITE, run_args, rc); if (rc) { /* Wake up all threads and tell them to exit */ for (i = 0; i < tc; i++) { mutex_enter(&thr->lock); thr->rc = rc; mutex_exit(&thr->lock); wake_up_process(tsks[i]); } goto out; } mutex_enter(&run_args->lock_ctl); ASSERT(run_args->threads_done == run_args->thread_count); run_args->threads_done = 0; mutex_exit(&run_args->lock_ctl); /* Wake up all threads for read phase */ kpios_upcall(run_args->pre, PHASE_READ, run_args, 0); for (i = 0; i < tc; i++) wake_up_process(tsks[i]); /* Wait for read phase to complete */ tr->start = current_kernel_time(); wait_event(run_args->waitq, kpios_thread_done(run_args)); tr->stop = current_kernel_time(); for (i = 0; i < tc; i++) { thr = run_args->threads[i]; mutex_enter(&thr->lock); if (!rc && thr->rc) rc = thr->rc; run_args->stats.rd_data += thr->stats.rd_data; run_args->stats.rd_chunks += thr->stats.rd_chunks; mutex_exit(&thr->lock); } kpios_upcall(run_args->post, PHASE_READ, run_args, rc); out: tt->stop = current_kernel_time(); tt->delta = timespec_sub(tt->stop, tt->start); tw->delta = timespec_sub(tw->stop, tw->start); tr->delta = timespec_sub(tr->stop, tr->start); cleanup: kmem_free(tsks, sizeof(struct task_struct *) * tc); cleanup2: kpios_upcall(run_args->post, PHASE_POST, run_args, rc); /* Returns first encountered thread error (if any) */ return rc; taskerr: /* Destroy all threads that were created successfully */ for (i = 0; i < tc; i++) if (tsks[i] != NULL) (void) kthread_stop(tsks[i]); goto cleanup; } static int kpios_do_one_run(struct file *file, kpios_cmd_t *kcmd, int data_size, void *data) { run_args_t *run_args; kpios_stats_t *stats = (kpios_stats_t *)data; int i, n, m, size, rc; if ((!kcmd->cmd_chunk_size) || (!kcmd->cmd_region_size) || (!kcmd->cmd_thread_count) || (!kcmd->cmd_region_count)) { kpios_print(file, "Invalid chunk_size, region_size, " "thread_count, or region_count, %d\n", -EINVAL); return -EINVAL; } if (!(kcmd->cmd_flags & DMU_WRITE) || !(kcmd->cmd_flags & DMU_READ)) { kpios_print(file, "Invalid flags, minimally DMU_WRITE " "and DMU_READ must be set, %d\n", -EINVAL); return -EINVAL; } if ((kcmd->cmd_flags & (DMU_WRITE_ZC | DMU_READ_ZC)) && (kcmd->cmd_flags & DMU_VERIFY)) { kpios_print(file, "Invalid flags, DMU_*_ZC incompatible " "with DMU_VERIFY, used for performance analysis " "only, %d\n", -EINVAL); return -EINVAL; } /* Opaque data on return contains structs of the following form: * * kpios_stat_t stats[]; * stats[0] = run_args->stats; * stats[1-N] = threads[N]->stats; * stats[N+1-M] = regions[M]->stats; * * Where N is the number of threads, and M is the number of regions. */ size = (sizeof(kpios_stats_t) + (kcmd->cmd_thread_count * sizeof(kpios_stats_t)) + (kcmd->cmd_region_count * sizeof(kpios_stats_t))); if (data_size < size) { kpios_print(file, "Invalid size, command data buffer " "size too small, (%d < %d)\n", data_size, size); return -ENOSPC; } rc = kpios_setup_run(&run_args, kcmd, file); if (rc) return rc; rc = kpios_threads_run(run_args); kpios_remove_objects(run_args); if (rc) goto cleanup; if (stats) { n = 1; m = 1 + kcmd->cmd_thread_count; stats[0] = run_args->stats; for (i = 0; i < kcmd->cmd_thread_count; i++) stats[n+i] = run_args->threads[i]->stats; for (i = 0; i < kcmd->cmd_region_count; i++) stats[m+i] = run_args->regions[i].stats; } cleanup: kpios_cleanup_run(run_args); return rc; } static int kpios_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); kpios_info_t *info; if (minor >= KPIOS_MINORS) return -ENXIO; info = (kpios_info_t *)kmem_alloc(sizeof(*info), KM_SLEEP); if (info == NULL) return -ENOMEM; spin_lock_init(&info->info_lock); info->info_size = KPIOS_INFO_BUFFER_SIZE; info->info_buffer = (char *)vmem_alloc(KPIOS_INFO_BUFFER_SIZE,KM_SLEEP); if (info->info_buffer == NULL) { kmem_free(info, sizeof(*info)); return -ENOMEM; } info->info_head = info->info_buffer; file->private_data = (void *)info; return 0; } static int kpios_release(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); kpios_info_t *info = (kpios_info_t *)file->private_data; if (minor >= KPIOS_MINORS) return -ENXIO; ASSERT(info); ASSERT(info->info_buffer); vmem_free(info->info_buffer, KPIOS_INFO_BUFFER_SIZE); kmem_free(info, sizeof(*info)); return 0; } static int kpios_buffer_clear(struct file *file, kpios_cfg_t *kcfg, unsigned long arg) { kpios_info_t *info = (kpios_info_t *)file->private_data; ASSERT(info); ASSERT(info->info_buffer); spin_lock(&info->info_lock); memset(info->info_buffer, 0, info->info_size); info->info_head = info->info_buffer; spin_unlock(&info->info_lock); return 0; } static int kpios_buffer_size(struct file *file, kpios_cfg_t *kcfg, unsigned long arg) { kpios_info_t *info = (kpios_info_t *)file->private_data; char *buf; int min, size, rc = 0; ASSERT(info); ASSERT(info->info_buffer); spin_lock(&info->info_lock); if (kcfg->cfg_arg1 > 0) { size = kcfg->cfg_arg1; buf = (char *)vmem_alloc(size, KM_SLEEP); if (buf == NULL) { rc = -ENOMEM; goto out; } /* Zero fill and truncate contents when coping buffer */ min = ((size < info->info_size) ? size : info->info_size); memset(buf, 0, size); memcpy(buf, info->info_buffer, min); vmem_free(info->info_buffer, info->info_size); info->info_size = size; info->info_buffer = buf; info->info_head = info->info_buffer; } kcfg->cfg_rc1 = info->info_size; if (copy_to_user((struct kpios_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) rc = -EFAULT; out: spin_unlock(&info->info_lock); return rc; } static int kpios_ioctl_cfg(struct file *file, unsigned long arg) { kpios_cfg_t kcfg; int rc = 0; if (copy_from_user(&kcfg, (kpios_cfg_t *)arg, sizeof(kcfg))) return -EFAULT; if (kcfg.cfg_magic != KPIOS_CFG_MAGIC) { kpios_print(file, "Bad config magic 0x%x != 0x%x\n", kcfg.cfg_magic, KPIOS_CFG_MAGIC); return -EINVAL; } switch (kcfg.cfg_cmd) { case KPIOS_CFG_BUFFER_CLEAR: /* cfg_arg1 - Unused * cfg_rc1 - Unused */ rc = kpios_buffer_clear(file, &kcfg, arg); break; case KPIOS_CFG_BUFFER_SIZE: /* cfg_arg1 - 0 - query size; >0 resize * cfg_rc1 - Set to current buffer size */ rc = kpios_buffer_size(file, &kcfg, arg); break; default: kpios_print(file, "Bad config command %d\n", kcfg.cfg_cmd); rc = -EINVAL; break; } return rc; } static int kpios_ioctl_cmd(struct file *file, unsigned long arg) { kpios_cmd_t kcmd; int rc = -EINVAL; void *data = NULL; rc = copy_from_user(&kcmd, (kpios_cfg_t *)arg, sizeof(kcmd)); if (rc) { kpios_print(file, "Unable to copy command structure " "from user to kernel memory, %d\n", rc); return -EFAULT; } if (kcmd.cmd_magic != KPIOS_CMD_MAGIC) { kpios_print(file, "Bad command magic 0x%x != 0x%x\n", kcmd.cmd_magic, KPIOS_CFG_MAGIC); return -EINVAL; } /* Allocate memory for any opaque data the caller needed to pass on */ if (kcmd.cmd_data_size > 0) { data = (void *)vmem_alloc(kcmd.cmd_data_size, KM_SLEEP); if (data == NULL) { kpios_print(file, "Unable to vmem_alloc() %ld " "bytes for data buffer\n", (long)kcmd.cmd_data_size); return -ENOMEM; } rc = copy_from_user(data, (void *)(arg + offsetof(kpios_cmd_t, cmd_data_str)), kcmd.cmd_data_size); if (rc) { kpios_print(file, "Unable to copy data buffer " "from user to kernel memory, %d\n", rc); vmem_free(data, kcmd.cmd_data_size); return -EFAULT; } } rc = kpios_do_one_run(file, &kcmd, kcmd.cmd_data_size, data); if (data != NULL) { /* If the test failed do not print out the stats */ if (rc) goto cleanup; rc = copy_to_user((void *)(arg + offsetof(kpios_cmd_t, cmd_data_str)), data, kcmd.cmd_data_size); if (rc) { kpios_print(file, "Unable to copy data buffer " "from kernel to user memory, %d\n", rc); rc = -EFAULT; } cleanup: vmem_free(data, kcmd.cmd_data_size); } return rc; } static int kpios_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { unsigned int minor = iminor(file->f_dentry->d_inode); int rc = 0; /* Ignore tty ioctls */ if ((cmd & 0xffffff00) == ((int)'T') << 8) return -ENOTTY; if (minor >= KPIOS_MINORS) return -ENXIO; switch (cmd) { case KPIOS_CFG: rc = kpios_ioctl_cfg(file, arg); break; case KPIOS_CMD: rc = kpios_ioctl_cmd(file, arg); break; default: kpios_print(file, "Bad ioctl command %d\n", cmd); rc = -EINVAL; break; } return rc; } /* I'm not sure why you would want to write in to this buffer from * user space since its principle use is to pass test status info * back to the user space, but I don't see any reason to prevent it. */ static ssize_t kpios_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned int minor = iminor(file->f_dentry->d_inode); kpios_info_t *info = (kpios_info_t *)file->private_data; int rc = 0; if (minor >= KPIOS_MINORS) return -ENXIO; ASSERT(info); ASSERT(info->info_buffer); spin_lock(&info->info_lock); /* Write beyond EOF */ if (*ppos >= info->info_size) { rc = -EFBIG; goto out; } /* Resize count if beyond EOF */ if (*ppos + count > info->info_size) count = info->info_size - *ppos; if (copy_from_user(info->info_buffer, buf, count)) { rc = -EFAULT; goto out; } *ppos += count; rc = count; out: spin_unlock(&info->info_lock); return rc; } static ssize_t kpios_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned int minor = iminor(file->f_dentry->d_inode); kpios_info_t *info = (kpios_info_t *)file->private_data; int rc = 0; if (minor >= KPIOS_MINORS) return -ENXIO; ASSERT(info); ASSERT(info->info_buffer); spin_lock(&info->info_lock); /* Read beyond EOF */ if (*ppos >= info->info_size) goto out; /* Resize count if beyond EOF */ if (*ppos + count > info->info_size) count = info->info_size - *ppos; if (copy_to_user(buf, info->info_buffer + *ppos, count)) { rc = -EFAULT; goto out; } *ppos += count; rc = count; out: spin_unlock(&info->info_lock); return rc; } static loff_t kpios_seek(struct file *file, loff_t offset, int origin) { unsigned int minor = iminor(file->f_dentry->d_inode); kpios_info_t *info = (kpios_info_t *)file->private_data; int rc = -EINVAL; if (minor >= KPIOS_MINORS) return -ENXIO; ASSERT(info); ASSERT(info->info_buffer); spin_lock(&info->info_lock); switch (origin) { case 0: /* SEEK_SET - No-op just do it */ break; case 1: /* SEEK_CUR - Seek from current */ offset = file->f_pos + offset; break; case 2: /* SEEK_END - Seek from end */ offset = info->info_size + offset; break; } if (offset >= 0) { file->f_pos = offset; file->f_version = 0; rc = offset; } spin_unlock(&info->info_lock); return rc; } static struct file_operations kpios_fops = { .owner = THIS_MODULE, .open = kpios_open, .release = kpios_release, .ioctl = kpios_ioctl, .read = kpios_read, .write = kpios_write, .llseek = kpios_seek, }; static struct cdev kpios_cdev = { .owner = THIS_MODULE, .kobj = { .name = "kpios", }, }; static int __init kpios_init(void) { dev_t dev; int rc; dev = MKDEV(KPIOS_MAJOR, 0); if ((rc = register_chrdev_region(dev, KPIOS_MINORS, "kpios"))) goto error; /* Support for registering a character driver */ cdev_init(&kpios_cdev, &kpios_fops); if ((rc = cdev_add(&kpios_cdev, dev, KPIOS_MINORS))) { printk(KERN_ERR "kpios: Error adding cdev, %d\n", rc); kobject_put(&kpios_cdev.kobj); unregister_chrdev_region(dev, KPIOS_MINORS); goto error; } /* Support for udev make driver info available in sysfs */ kpios_class = class_create(THIS_MODULE, "kpios"); if (IS_ERR(kpios_class)) { rc = PTR_ERR(kpios_class); printk(KERN_ERR "kpios: Error creating kpios class, %d\n", rc); cdev_del(&kpios_cdev); unregister_chrdev_region(dev, KPIOS_MINORS); goto error; } class_device_create(kpios_class, NULL, dev, NULL, "kpios"); printk(KERN_INFO "kpios: Loaded Kernel PIOS Tests v%s\n", VERSION); return 0; error: printk(KERN_ERR "kpios: Error registering kpios device, %d\n", rc); return rc; } static void kpios_fini(void) { dev_t dev = MKDEV(KPIOS_MAJOR, 0); class_device_destroy(kpios_class, dev); class_destroy(kpios_class); cdev_del(&kpios_cdev); unregister_chrdev_region(dev, KPIOS_MINORS); printk(KERN_INFO "kpios: Unloaded Kernel PIOS Tests\n"); return; } module_init(kpios_init); module_exit(kpios_fini); MODULE_AUTHOR("LLNL / Sun"); MODULE_DESCRIPTION("Kernel PIOS implementation"); MODULE_LICENSE("GPL");