commit 34dc7c2f2553220ebc6e29ca195fb6d57155f95f
Author: Brian Behlendorf
Date:   Thu Nov 20 12:01:55 2008 -0800

    Initial Linux ZFS GIT Repo

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000000..04ebc204f6
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,4 @@
+Brian Behlendorf ,
+Herb Wartens ,
+Jim Garlick ,
+Ricardo M. Correia
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000000..8a74328d6c
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,114 @@
+2008-11-19  Brian Behlendorf
+
+	* : Tag zfs-0.4.0
+
+	* : ZFS project migrated from Subversion, which leveraged a
+	quilt-based patch stack, to Git and a TopGit-managed patch
+	stack.  The new method treats all patches as Git branches
+	which can be more easily shared for distributed development.
+	Consult the top level GIT file for detailed information on
+	how to properly develop for this package using Git+TopGit.
+
+2008-11-12  Brian Behlendorf
+
+	* : Tag zfs-0.3.4
+
+	* zfs-07-create-dev-zfs.patch:
+	Ricardo M. Correia
+	- Make libzfs create /dev/zfs if it doesn't exist.
+
+	* zfs-05-check-zvol-size.patch:
+	Ricardo M. Correia
+	- Properly check zvol size under Linux.
+
+	* zfs-04-no-openat-fdopendir.patch:
+	Ricardo M. Correia
+	- Do not use openat() and fdopendir() since they are not available
+	on older systems.
+
+	* zfs-03-fix-bio-sync.patch:
+	Ricardo M. Correia
+	- Fix memory corruption in RHEL4 due to synchronous IO becoming
+	asynchronous.
+
+2008-11-06  Brian Behlendorf
+
+	* zfs-02-zpios-fix-stuck-thread-memleak.patch:
+	Ricardo M. Correia
+	- Fix stuck threads and memory leaks when errors occur while writing.
+
+	* zfs-01-zpios-arg-corruption.patch:
+	Ricardo M. Correia
+	- Fix zpios cmd line argument corruption problem.
+
+	* zfs-00-minor-fixes.patch:
+	Ricardo M. Correia
+	- Minor build system improvements
+	- Minor script improvements
+	- Create a full copy and not a link tree with quilt
+	- KPIOS_MAJOR changed from 231 to 232
+	- BIO_RW_BARRIER flag removed from IO request
+
+2008-06-30  Brian Behlendorf
+
+	* : Tag zfs-0.3.3
+
+	* : Minor script updates and tweaks to be compatible with
+	the latest version of the SPL.
+
+2008-06-13  Brian Behlendorf
+
+	* vdev_disk.diff: Replace vdev_disk implementation which was
+	based on the kmalloc'ed logical address space with a version
+	which works with vmalloc'ed memory in the virtual address space.
+	This was done to support the new SPL slab implementation which
+	is based on virtual addresses to avoid the need for contiguously
+	allocated memory.
+
+2008-06-05  Brian Behlendorf
+
+	* arc-vm-integration.diff: Reduce maximum default arc memory
+	usage to 1/4 of total system memory.  Because all the bulk data
+	is still allocated on the slab, memory fragmentation is a serious
+	concern.  To address this in the short term we simply need to
+	leave lots of free memory.
+
+	* fix-stack.diff: First step towards reducing stack usage so
+	we can run the full ZFS stack using a stock kernel.
+
+2008-06-04  Brian Behlendorf
+
+	* : Tag zfs-0.3.2
+
+	* : Extensive improvements to the build system to detect kernel
+	API changes so we can flexibly build with a wider range of kernel
+	versions.  The code has now been tested with the 2.6.18-32chaos
+	and 2.6.25.3-18.fc9 kernels; however, we should also be compatible
+	with other kernels in the range of 2.6.18-2.6.25.  The only
+	remaining issue preventing us from running with a stock
+	kernel is ZFS stack usage.
+
+2008-05-21  Brian Behlendorf
+
+	* : Tag zfs-0.3.1
+
+	* : License headers including UCRL added for release.
+
+2008-05-21  Brian Behlendorf
+
+	* : Tag zfs-0.3.0
+
+	* configure.ac: Improved autotools support and configurable debug.
+
+2008-05-15  Brian Behlendorf
+
+	* : Updating original ZFS sources to build 89 which
+	includes the new write throttling changes plus support
+	for using ZFS as your root device.  Neither of which
+	will work exactly right without some more work, but this
+	gets us much closer to the latest source.
+
+
+2008-02-28  Brian Behlendorf
+
+	* : First attempt based on SPL module and zfs-lustre sources
diff --git a/DISCLAIMER b/DISCLAIMER
new file mode 100644
index 0000000000..86778c593f
--- /dev/null
+++ b/DISCLAIMER
@@ -0,0 +1,22 @@
+This notice is required to be provided under our contract with the
+U.S. Department of Energy (DOE).  This work was produced at the
+Lawrence Livermore National Laboratory under Contract with the DOE.
+
+Neither the United States Government nor the Lawrence Livermore National
+Security, LLC. nor any of their employees, makes any warranty, express
+or implied, or assumes any liability or responsibility for the accuracy,
+completeness, or usefulness of any information, apparatus, product,
+or process disclosed, or represents that its use would not infringe
+privately-owned rights.
+
+Also, reference herein to any specific commercial products, process,
+or services by trade name, trademark, manufacturer or otherwise does
+not necessarily constitute or imply its endorsement, recommendation,
+or favoring by the United States Government or the Lawrence Livermore
+National Security, LLC.  The views and opinions of authors expressed
+herein do not necessarily state or reflect those of the United States
+Government or the Lawrence Livermore National Security, LLC., and
+shall not be used for advertising or product endorsement purposes.
+
+The precise terms and conditions for copying, distribution and
+modification are specified in the file "COPYING".
diff --git a/GIT b/GIT
new file mode 100644
index 0000000000..578e7316b4
--- /dev/null
+++ b/GIT
@@ -0,0 +1,162 @@
+=========================== WHY USE GIT+TOPGIT? ==========================
+
+Three major concerns were on our mind when setting up this project.
+
+  o First, we needed to structure the project in such a way that it would
+    be easy to rebase all of our changes on the latest official ZFS release
+    from Sun.  We absolutely need to be able to benefit from the upstream
+    improvements and not get locked into an old version of the code base.
+
+  o Secondly, we wanted to be able to easily manage our changes in terms
+    of a patch stack.  This allows us to easily isolate specific changes
+    and push them upstream for inclusion.  It also allows us to easily
+    update or drop specific changes based on what occurs upstream.
+
+  o Thirdly, we needed our DVCS to be integrated with the management of
+    this patch stack.  We have tried other methods in the past, such as
+    SVN+Quilt, but have found that managing the patch stack becomes
+    cumbersome.  By using Git+TopGit to more tightly integrate our patch
+    stack into the repo we expect several benefits.  One of the most
+    important will be the ability to easily work on the patch stack with
+    a distributed developer team; additionally, the repo can track patch
+    history, and we can utilize Git to merge patches and resolve conflicts.
+
+TopGit is designed to specifically address these concerns by providing
+tools to simplify the handling of large numbers of interdependent topic
+branches.
+When using a TopGit-aware repo, every topic branch represents
+a 'patch' and that branch references its dependent branches.  The union
+of all these branches is your final source base.
+
+========================= SETTING UP GIT+TOPGIT ==========================
+
+First off you need to install a Git package on your system.  For my
+purposes I have been working on a RHEL5 system with git version 1.5.4.5
+installed and it has been working well.  You will also need to go get
+the latest version of TopGit, which likely is not packaged nicely, so you
+will need to build it from source.  You can use Git to clone TopGit
+from the official site here and you're all set:
+
+	> git clone http://repo.or.cz/w/topgit.git
+	> make
+	> make install		# Default installs to $(HOME)
+
+========================== TOPGIT AND ZFS ================================
+
+Once you have Git and TopGit installed you will want to clone a copy of
+the Linux ZFS repo.  While this project does not yet have a public home,
+it hopefully will some day.  In the meantime, if you have VPN access to
+LLNL you can clone the latest official repo here.  Cloning a TopGit
+controlled repo is very similar to cloning a normal Git repo, but you
+need to remember to use 'tg remote' to populate all topic branches.
+
+	> git clone http://eris.llnl.gov/git/zfs.git zfs
+	> cd zfs
+	> tg remote --populate origin
+
+Now that you have the Linux ZFS repo, the first thing you will probably
+want to do is have a look at all the topic branches.  TopGit provides
+a summary command which shows all the branches and a brief summary for
+each branch, obtained from the .topmsg files.
+
+	> tg summary
+	0	t/LAST			[PATCH] LAST
+		t/feature-commit-cb	[PATCH] zfs commit callbacks
+		t/fix-clock-wrap	[PATCH] fix clock wrap
+		t/fix-dnode-cons	[PATCH] fix dnode constructor
+	...
+
+By convention all TopGit branches are prefixed with 't/', and the Linux
+ZFS repo also introduces the convention that the topmost development
+branch be called 't/LAST'.  This provides a consistent label to be used
+when you need to reference the branch which contains the union of all
+topic branches.
+
+One thing you may also notice about the 'tg summary' command is that it
+does not show the branches in dependency order.  While this project only
+expresses a single dependency per branch, TopGit implements dependencies
+as a DAG, just like Git.  To see the dependencies you will need to use
+the --graphviz option and pipe the result to dot for display.  The
+following command, while long, works fairly well for me.  Longer term it
+would be nice to update this option to use preferred config options
+stored in the repo, if they exist.
+
+	> tg summary --graphviz | dot -Txlib -Nfontsize=8 -Eminlen=0.01 \
+	  -Grankdir=LR -Nheight=0.3 -Nwidth=2 -Nfixedsize=true
+
+========================= UPDATING A TOPIC BRANCH ========================
+
+Updating a topic branch in TopGit is pretty straightforward, but there
+are a few rules you need to be aware of.  The basic process involves
+checking out the relevant topic branch where the changes need to be made,
+making the changes, committing the changes to the branch, and then merging
+those changes into dependent branches.  TopGit provides some tools to make
+this pretty easy, although it may be a little sluggish.  Here is an example:
+
+	> git checkout t/feature-commit-cb	# Checkout the proper branch
+	> ...update branch...			# Update the branch
+	> git commit -a				# Commit your changes
+	> git checkout t/LAST			# Checkout the LAST branch
+	> tg update				# Recursively merge in new branch
+
+Assuming your change does not introduce any conflicts, you're done.  All
+branches which were dependent on your change will have had the change
+merged in.  If your change introduced a conflict, you will need to resolve
+the conflict and then continue on with the update, as sketched below.
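+When 'tg update' does stop on a conflict, recovery is ordinary Git merge
+resolution.  The sequence below is a sketch only; it assumes stock Git
+merge behavior, and the exact continuation steps may differ between
+TopGit versions:
+
+	> tg update			# Merge stops and reports conflicts
+	> ...resolve conflicts...	# Edit each conflicted file, 'git add' it
+	> git commit			# Commit the resolved merge
+	> tg update			# Re-run to continue with remaining branches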
+
+========================== ADDING A TOPIC BRANCH =========================
+
+Adding a topic branch in TopGit is a little more complicated.  When adding
+a new branch to the end of the patch graph, things are pretty easy and
+TopGit does all the work.  However, I expect our common case to be adding
+patches to the middle of the graph.  TopGit will allow you to do this, but
+you must be careful to manually update the dependency information in the
+.topdeps file.
+
+	> git co t/existing-topic-branch	# Checkout the branch to add after
+	> tg create t/new-topic-branch		# Create a new topic branch
+	> ...update .topmsg...			# Update the branch message
+	> ...create patch...			# Update with your changes
+	> git commit -a				# Commit your changes
+	> git co t/dependent-topic-branch	# Checkout dependent branch
+	> ...update .topdeps...			# Manually update dependencies
+	> git commit -a				# Commit your changes
+	> tg update				# TopGit update
+	> git checkout t/LAST			# Checkout the LAST branch
+	> tg update				# Recursively merge in new branch
+
+========================= REMOVING A TOPIC BRANCH ========================
+
+Removing a topic branch in TopGit is also currently not very easy.  To
+remove a dependent branch, the basic process is to commit a patch which
+reverts all changes on the branch.  Then that reversion must be merged
+into all dependent branches, the dependencies manually updated, and
+finally the branch removed.  If the branch is not empty you will not be
+able to remove it.
+
+	> git co t/del-topic-branch		# Checkout the branch to delete
+	> tg patch | patch -R -p1		# Revert all branch changes
+	> git commit -a				# Commit your changes
+	> git checkout t/LAST			# Checkout the LAST branch
+	> tg update				# Recursively merge revert
+	> git co t/dependent-topic-branch	# Checkout dependent branch
+	> ...update .topdeps...			# Manually update dependencies
+	> git commit -a				# Commit your changes
+	> tg delete t/del-topic-branch		# Delete empty topic branch
+
+============================ TOPGIT TODO =================================
+
+TopGit is still a young package which seems to be under active development
+by its author.  It provides the minimum set of commands needed, but there
+are clearly areas which simply have not yet been implemented.  My short
+list of features includes:
+
+  o 'tg summary --deps', option to display a text version of the topic
+    branch dependency DAG (see the sketch after this list for a rough
+    interim workaround).
+
+  o 'tg depend list', list all topic branch dependencies.
+
+  o 'tg depend delete', cleanly remove a topic branch dependency.
+
+  o 'tg create', cleanly insert a topic branch in the middle
+    of the graph and properly take care of updating all dependencies.
+
+  o 'tg delete', cleanly delete a topic branch in the middle
+    of the graph and properly take care of updating all dependencies.
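+Until something like 'tg summary --deps' exists, a rough text listing of
+the dependency graph can be scraped from the .topdeps files directly.  The
+loop below is a sketch only; it assumes every topic branch lives under
+refs/heads/t and carries a .topdeps file, and it uses plain Git commands
+rather than TopGit:
+
+	> for b in $(git for-each-ref --format='%(refname)' refs/heads/t |
+	>            sed 's,^refs/heads/,,'); do
+	>     echo "$b depends on:"
+	>     git cat-file blob "$b:.topdeps" 2>/dev/null | sed 's/^/    /'
+	> done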
diff --git a/META b/META
new file mode 100644
index 0000000000..9b66d312f0
--- /dev/null
+++ b/META
@@ -0,0 +1,6 @@
+Meta: 1
+Name: zfs
+Branch: 1.0
+Version: 0.3.4
+Release: 1
+Release-Tags: relext
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000000..39f3fd0848
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,25 @@
+AUTOMAKE_OPTIONS = foreign dist-zip
+
+SUBDIRS = doc scripts $(BUILDDIR)
+CONFIG_CLEAN_FILES = aclocal.m4 config.guess config.sub
+CONFIG_CLEAN_FILES += depcomp missing mkinstalldirs
+EXTRA_DIST = autogen.sh
+
+.PHONY: quilt
+quilt: .quilt-$(BUILDDIR)
+autogen: .autogen-$(BUILDDIR)
+config: .config-$(BUILDDIR)
+.quilt-$(BUILDDIR):
+	./scripts/quilt.sh -p $(NAME) -b $(BUILDDIR) -s $(SERIESFILE) -d $(PATCHDIR)
+	echo $(BUILDDIR) >$@
+
+unquilt:
+	rm -rf $(BUILDDIR)
+	rm -f .quilt-$(BUILDDIR)
+
+clean-generic:
+
+distclean: unquilt
+
+rpms: dist Makefile
+	rpmbuild -ta $(distdir).tar.gz
diff --git a/README b/README
new file mode 100644
index 0000000000..b17c1d83fe
--- /dev/null
+++ b/README
@@ -0,0 +1,74 @@
+============================ ZFS KERNEL BUILD ============================
+
+1) Build the SPL (Solaris Porting Layer) module which is designed to
+   provide many Solaris APIs in the Linux kernel which are needed
+   by ZFS.  To build the SPL:
+
+	tar -xzf spl-x.y.z.tgz
+	cd spl-x.y.z
+	./configure --with-linux=<path to kernel source>
+	make
+	make check
+
+2) Build ZFS.  This port is based on build 89 of ZFS from OpenSolaris.
+   You will need to have both the kernel and SPL source available.
+   To build ZFS for use as a Linux kernel module (default):
+
+	tar -xzf zfs-x.y.z.tgz
+	cd zfs-x.y.z
+	./configure --with-linux=<path to kernel source> \
+	            --with-spl=<path to spl source>
+	make
+	make check
+
+========================= ZFS USER LIBRARY BUILD =========================
+
+1) Build ZFS.  This port is based on build 89 of ZFS from OpenSolaris.
+   To build ZFS as a userspace library:
+
+	tar -xzf zfs-x.y.z.tgz
+	cd zfs-x.y.z
+	./configure --with-zfs-config=user
+	make
+	make check
+
+============================ ZFS LUSTRE BUILD ============================
+
+1) Build the SPL (Solaris Porting Layer) module which is designed to
+   provide many Solaris APIs in the Linux kernel which are needed
+   by ZFS.  To build the SPL:
+
+	tar -xzf spl-x.y.z.tgz
+	cd spl-x.y.z
+	./configure --with-linux=<path to kernel source>
+	make
+	make check
+
+2) Build ZFS.  This port is based on build 89 of ZFS from OpenSolaris.
+   To build ZFS as a userspace library for use by a Lustre filesystem:
+
+	tar -xzf zfs-x.y.z.tgz
+	cd zfs-x.y.z
+	./configure --with-zfs-config=lustre \
+	            --with-linux=<path to kernel source> \
+	            --with-spl=<path to spl source>
+	make
+	make check
+
+3) Provided is an in-kernel test application called kpios which can be
+   used to simulate a Lustre IO load.  It may be used as a stress test
+   or as a performance benchmark to measure your configuration.  To
+   simplify testing there are scripts provided in the scripts/ directory.
+   A single test can be run as follows:
+
+   WARNING: You MUST update DEVICES in the create-zpool.sh script
+   to reference the devices you wish to use.
+
+	cd scripts
+	./load-zfs.sh		# Load the ZFS/SPL module stack
+	./create-zpool.sh	# Modify DEVICES to list your zpool devices
+	./zpios.sh		# Modify for your particular kpios test
+	./unload-zfs.sh		# Unload the ZFS/SPL module stack
+
+Enjoy,
+Brian Behlendorf
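+When a distribution splits the kernel source and build objects into
+separate trees, the configure script generated from autoconf/zfs-build.m4
+(below) also accepts --with-linux-obj and --with-spl-obj.  A hypothetical
+invocation; all paths here are illustrative only:
+
+	./configure --with-zfs-config=kernel \
+	            --with-linux=/usr/src/linux-2.6.25 \
+	            --with-linux-obj=/usr/src/linux-2.6.25-obj \
+	            --with-spl=/usr/src/spl-0.4.0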
diff --git a/TODO b/TODO
new file mode 100644
index 0000000000..0d9310dfde
--- /dev/null
+++ b/TODO
@@ -0,0 +1,30 @@
+* We may need a libefi replacement.  It appears libefi is used
+  to determine if the device passed to zpool is a 'whole device'
+  or just a partition of a device.  In the short term I think we
+  can simply treat everything as a partition and be alright.
+
+* We also do not have support for getting Solaris-style device
+  ids, which is done when a zpool is set up.  We may or may not
+  be able to live without this; the jury is still out.
+
+-----------------------------------------------------------------------
+
+* Port zvol (ZFS volume interface).
+
+* Port zpl (ZFS posix interface).
+
+* Port the Lustre fsfilt interface to use the DMU.
+
+* Andreas issue #1:
+  "the maximum allocation DMU "blocksize" was 128kB and it would be better
+  to be able to get 1MB contiguous allocations for best performance"
+
+* Andreas issue #2:
+  "there would need to be some work done to allow multiple operations to
+  be atomic.  This is needed by Lustre for object creation + LAST_ID
+  updates, unlink + llog updates, etc.  Conceptually this isn't very
+  much work for a phase tree, but I've never looked at the ZFS code."
+
+* Design and implement a mechanism for viewing and modifying OST content
+  from user space (by manipulating datasets/objects), possibly
+  by implementing a scaled-down file system interface.
diff --git a/autoconf/Makefile.am b/autoconf/Makefile.am
new file mode 100644
index 0000000000..688e8fda44
--- /dev/null
+++ b/autoconf/Makefile.am
@@ -0,0 +1 @@
+EXTRA_DIST = zfs-build.m4
diff --git a/autoconf/zfs-build.m4 b/autoconf/zfs-build.m4
new file mode 100644
index 0000000000..773d42832f
--- /dev/null
+++ b/autoconf/zfs-build.m4
@@ -0,0 +1,378 @@
+AC_DEFUN([ZFS_AC_CONFIG], [
+
+	AC_ARG_WITH([zfs-config],
+		AS_HELP_STRING([--with-zfs-config=CONFIG],
+		[Config file 'kernel|user|lustre']),
+		[zfsconfig="$withval"])
+
+	AC_MSG_CHECKING([zfs config file])
+	if test -z "$zfsconfig" || test ! -r configs/$zfsconfig; then
+		AC_MSG_RESULT([no])
+		AC_MSG_ERROR([
+		*** Please specify one of the valid config files located
+		*** in ./configs/ with the '--with-zfs-config=CONFIG' option])
+	fi
+
+	AC_MSG_RESULT([$zfsconfig]);
+	. ./configs/$zfsconfig
+
+	TOPDIR=`/bin/pwd`
+	ZFSDIR=${TOPDIR}/$BUILDDIR
+	LIBDIR=$ZFSDIR/lib
+	CMDDIR=$ZFSDIR/zcmd
+
+	AC_SUBST(TOPDIR)
+	AC_SUBST(ZFSDIR)
+	AC_SUBST(LIBDIR)
+	AC_SUBST(CMDDIR)
+])
+
+AC_DEFUN([ZFS_AC_KERNEL], [
+	ver=`uname -r`
+
+	AC_ARG_WITH([linux],
+		AS_HELP_STRING([--with-linux=PATH],
+		[Path to kernel source]),
+		[kernelsrc="$withval"; kernelbuild="$withval"])
+
+	AC_ARG_WITH(linux-obj,
+		AS_HELP_STRING([--with-linux-obj=PATH],
+		[Path to kernel build objects]),
+		[kernelbuild="$withval"])
+
+	AC_MSG_CHECKING([kernel source directory])
+	if test -z "$kernelsrc"; then
+		kernelbuild=
+		sourcelink=/lib/modules/${ver}/source
+		buildlink=/lib/modules/${ver}/build
+
+		if test -e $sourcelink; then
+			kernelsrc=`(cd $sourcelink; /bin/pwd)`
+		fi
+		if test -e $buildlink; then
+			kernelbuild=`(cd $buildlink; /bin/pwd)`
+		fi
+		if test -z "$kernelsrc"; then
+			kernelsrc=$kernelbuild
+		fi
+		if test -z "$kernelsrc" -o -z "$kernelbuild"; then
+			AC_MSG_RESULT([Not found])
+			AC_MSG_ERROR([
+			*** Please specify the location of the kernel source
+			*** with the '--with-linux=PATH' option])
+		fi
+	fi
+
+	AC_MSG_RESULT([$kernelsrc])
+	AC_MSG_CHECKING([kernel build directory])
+	AC_MSG_RESULT([$kernelbuild])
+
+	AC_MSG_CHECKING([kernel source version])
+	if test -r $kernelbuild/include/linux/version.h &&
+		fgrep -q UTS_RELEASE $kernelbuild/include/linux/version.h; then
+
+		kernsrcver=`(echo "#include <linux/version.h>";
+		             echo "kernsrcver=UTS_RELEASE") |
+		             cpp -I $kernelbuild/include |
+		             grep "^kernsrcver=" | cut -d \" -f 2`
+
+	elif test -r $kernelbuild/include/linux/utsrelease.h &&
+		fgrep -q UTS_RELEASE $kernelbuild/include/linux/utsrelease.h; then
+
+		kernsrcver=`(echo "#include <linux/utsrelease.h>";
+		             echo "kernsrcver=UTS_RELEASE") |
+		             cpp -I $kernelbuild/include |
+		             grep "^kernsrcver=" | cut -d \" -f 2`
+	fi
+
+	if test -z "$kernsrcver"; then
+		AC_MSG_RESULT([Not found])
+		AC_MSG_ERROR([
+		*** Cannot determine the version of the Linux kernel source.
+		*** Please prepare the kernel before running this script])
+	fi
+
+	AC_MSG_RESULT([$kernsrcver])
+
+	kmoduledir=${INSTALL_MOD_PATH}/lib/modules/$kernsrcver
+	LINUX=${kernelsrc}
+	LINUX_OBJ=${kernelbuild}
+
+	AC_SUBST(LINUX)
+	AC_SUBST(LINUX_OBJ)
+	AC_SUBST(kmoduledir)
+])
+
+AC_DEFUN([ZFS_AC_SPL], [
+
+	AC_ARG_WITH([spl],
+		AS_HELP_STRING([--with-spl=PATH],
+		[Path to spl source]),
+		[splsrc="$withval"; splbuild="$withval"])
+
+	AC_ARG_WITH([spl-obj],
+		AS_HELP_STRING([--with-spl-obj=PATH],
+		[Path to spl build objects]),
+		[splbuild="$withval"])
+
+
+	AC_MSG_CHECKING([spl source directory])
+	if test -z "$splsrc"; then
+		splbuild=
+		sourcelink=/tmp/`whoami`/spl
+		buildlink=/tmp/`whoami`/spl
+
+		if test -e $sourcelink; then
+			splsrc=`(cd $sourcelink; /bin/pwd)`
+		fi
+		if test -e $buildlink; then
+			splbuild=`(cd $buildlink; /bin/pwd)`
+		fi
+		if test -z "$splsrc"; then
+			splsrc=$splbuild
+		fi
+	fi
+
+	if test -z "$splsrc" -o -z "$splbuild"; then
+		sourcelink=/lib/modules/${ver}/source
+		buildlink=/lib/modules/${ver}/build
+
+		if test -e $sourcelink; then
+			splsrc=`(cd $sourcelink; /bin/pwd)`
+		fi
+		if test -e $buildlink; then
+			splbuild=`(cd $buildlink; /bin/pwd)`
+		fi
+		if test -z "$splsrc"; then
+			splsrc=$splbuild
+		fi
+		if test -z "$splsrc" -o -z "$splbuild"; then
+			AC_MSG_RESULT([Not found])
+			AC_MSG_ERROR([
+			*** Please specify the location of the spl source
+			*** with the '--with-spl=PATH' option])
+		fi
+	fi
+
+	AC_MSG_RESULT([$splsrc])
+	AC_MSG_CHECKING([spl build directory])
+	AC_MSG_RESULT([$splbuild])
+
+	AC_MSG_CHECKING([spl source version])
+	if test -r $splbuild/spl_config.h &&
+		fgrep -q VERSION $splbuild/spl_config.h; then
+
+		splsrcver=`(echo "#include <spl_config.h>";
+		            echo "splsrcver=VERSION") |
+		            cpp -I $splbuild |
+		            grep "^splsrcver=" | cut -d \" -f 2`
+	fi
+
+	if test -z "$splsrcver"; then
+		AC_MSG_RESULT([Not found])
+		AC_MSG_ERROR([
+		*** Cannot determine the version of the spl source.
+		*** Please prepare the spl source before running this script])
+	fi
+
+	AC_MSG_RESULT([$splsrcver])
+
+	AC_MSG_CHECKING([spl Module.symvers])
+	if test -r $splbuild/modules/Module.symvers; then
+		splsymvers=$splbuild/modules/Module.symvers
+	elif test -r $kernelbuild/Module.symvers; then
+		splsymvers=$kernelbuild/Module.symvers
+	fi
+
+	if test -z "$splsymvers"; then
+		AC_MSG_RESULT([Not found])
+		AC_MSG_ERROR([
+		*** Cannot find extra Module.symvers in the spl source.
+		*** Please prepare the spl source before running this script])
+	fi
+
+	AC_MSG_RESULT([$splsymvers])
+	AC_SUBST(splsrc)
+	AC_SUBST(splsymvers)
+])
+
+AC_DEFUN([ZFS_AC_LICENSE], [
+	AC_MSG_CHECKING([license])
+	AC_MSG_RESULT([CDDL])
+dnl #	AC_DEFINE([HAVE_GPL_ONLY_SYMBOLS], [1],
+dnl #		[Define to 1 if module is licensed under the GPL])
+])
+
+AC_DEFUN([ZFS_AC_DEBUG], [
+	AC_MSG_CHECKING([whether debugging is enabled])
+	AC_ARG_ENABLE( [debug],
+		AS_HELP_STRING([--enable-debug],
+		[Enable generic debug support (default off)]),
+		[ case "$enableval" in
+			yes) zfs_ac_debug=yes ;;
+			no)  zfs_ac_debug=no  ;;
+			*) AC_MSG_RESULT([Error!])
+			AC_MSG_ERROR([Bad value "$enableval" for --enable-debug]) ;;
+		esac ]
+	)
+	if test "$zfs_ac_debug" = yes; then
+		AC_MSG_RESULT([yes])
+		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG "
+		HOSTCFLAGS="${HOSTCFLAGS} -DDEBUG "
+	else
+		AC_MSG_RESULT([no])
+		AC_DEFINE([NDEBUG], [1],
+		[Define to 1 to disable debug tracing])
+		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DNDEBUG "
+		HOSTCFLAGS="${HOSTCFLAGS} -DNDEBUG "
+	fi
+])
+
+AC_DEFUN([ZFS_AC_SCRIPT_CONFIG], [
+	SCRIPT_CONFIG=.script-config
+	rm -f ${SCRIPT_CONFIG}
+	echo "KERNELSRC=${LINUX}" >>${SCRIPT_CONFIG}
+	echo "KERNELBUILD=${LINUX_OBJ}" >>${SCRIPT_CONFIG}
+	echo "KERNELSRCVER=$kernsrcver" >>${SCRIPT_CONFIG}
+	echo >>${SCRIPT_CONFIG}
+	echo "SPLSRC=$splsrc" >>${SCRIPT_CONFIG}
+	echo "SPLBUILD=$splbuild" >>${SCRIPT_CONFIG}
+	echo "SPLSRCVER=$splsrcver" >>${SCRIPT_CONFIG}
+	echo "SPLSYMVERS=$splsymvers" >>${SCRIPT_CONFIG}
+	echo >>${SCRIPT_CONFIG}
+	echo "ZFSSRC=${TOPDIR}/src" >>${SCRIPT_CONFIG}
+	echo "ZFSBUILD=${ZFSDIR}" >>${SCRIPT_CONFIG}
+	echo >>${SCRIPT_CONFIG}
+	echo "TOPDIR=${TOPDIR}" >>${SCRIPT_CONFIG}
+	echo "LIBDIR=${LIBDIR}" >>${SCRIPT_CONFIG}
+	echo "CMDDIR=${CMDDIR}" >>${SCRIPT_CONFIG}
+])
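+The net result of ZFS_AC_SCRIPT_CONFIG is a flat, shell-sourceable
+key/value file, evidently intended for the helpers in scripts/.  A
+hypothetical .script-config; every value below is illustrative only:
+
+	KERNELSRC=/usr/src/linux-2.6.25
+	KERNELBUILD=/usr/src/linux-2.6.25-obj
+	KERNELSRCVER=2.6.25.3-18.fc9
+
+	SPLSRC=/tmp/jdoe/spl
+	SPLBUILD=/tmp/jdoe/spl
+	SPLSRCVER=0.4.0
+	SPLSYMVERS=/tmp/jdoe/spl/modules/Module.symvers
+
+	ZFSSRC=/home/jdoe/zfs/src
+	ZFSBUILD=/home/jdoe/zfs/zfs+kernel
+
+	TOPDIR=/home/jdoe/zfs
+	LIBDIR=/home/jdoe/zfs/zfs+kernel/lib
+	CMDDIR=/home/jdoe/zfs/zfs+kernel/zcmd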
+
+dnl #
+dnl # ZFS_LINUX_CONFTEST
+dnl #
+AC_DEFUN([ZFS_LINUX_CONFTEST], [
+cat >conftest.c <<_ACEOF
+$1
+_ACEOF
+])
+
+dnl #
+dnl # ZFS_LANG_PROGRAM(C)([PROLOGUE], [BODY])
+dnl #
+m4_define([ZFS_LANG_PROGRAM], [
+$1
+int
+main (void)
+{
+dnl Do *not* indent the following line: there may be CPP directives.
+dnl Don't move the `;' right after for the same reason.
+$2
+  ;
+  return 0;
+}
+])
+
+dnl #
+dnl # ZFS_LINUX_COMPILE_IFELSE / like AC_COMPILE_IFELSE
+dnl #
+AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [
+m4_ifvaln([$1], [ZFS_LINUX_CONFTEST([$1])])dnl
+rm -f build/conftest.o build/conftest.mod.c build/conftest.ko build/Makefile
+echo "obj-m := conftest.o" >build/Makefile
+dnl AS_IF([AC_TRY_COMMAND(cp conftest.c build && make [$2] CC="$CC" -f $PWD/build/Makefile LINUXINCLUDE="-Iinclude -include include/linux/autoconf.h" -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM SUBDIRS=$PWD/build) >/dev/null && AC_TRY_COMMAND([$3])],
+AS_IF([AC_TRY_COMMAND(cp conftest.c build && make [$2] CC="$CC" LINUXINCLUDE="-Iinclude -include include/linux/autoconf.h" -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build) >/dev/null && AC_TRY_COMMAND([$3])],
+	[$4],
+	[_AC_MSG_LOG_CONFTEST
+m4_ifvaln([$5],[$5])dnl])dnl
+rm -f build/conftest.o build/conftest.mod.c build/conftest.mod.o build/conftest.ko m4_ifval([$1], [build/conftest.c conftest.c])[]dnl
+])
+
+dnl #
+dnl # ZFS_LINUX_TRY_COMPILE like AC_TRY_COMPILE
+dnl #
+AC_DEFUN([ZFS_LINUX_TRY_COMPILE],
+	[ZFS_LINUX_COMPILE_IFELSE(
+	[AC_LANG_SOURCE([ZFS_LANG_PROGRAM([[$1]], [[$2]])])],
+	[modules],
+	[test -s build/conftest.o],
+	[$3], [$4])
+])
+
+dnl #
+dnl # ZFS_LINUX_CONFIG
+dnl #
+AC_DEFUN([ZFS_LINUX_CONFIG],
+	[AC_MSG_CHECKING([whether Linux was built with CONFIG_$1])
+	ZFS_LINUX_TRY_COMPILE([
+		#ifndef AUTOCONF_INCLUDED
+		#include <linux/config.h>
+		#endif
+	],[
+		#ifndef CONFIG_$1
+		#error CONFIG_$1 not #defined
+		#endif
+	],[
+		AC_MSG_RESULT([yes])
+		$2
+	],[
+		AC_MSG_RESULT([no])
+		$3
+	])
+])
+
+dnl #
+dnl # ZFS_CHECK_SYMBOL_EXPORT
+dnl # check symbol exported or not
+dnl #
+AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT],
+	[AC_MSG_CHECKING([whether symbol $1 is exported])
+	grep -q -E '[[[:space:]]]$1[[[:space:]]]' $LINUX/Module.symvers 2>/dev/null
+	rc=$?
+	if test $rc -ne 0; then
+		export=0
+		for file in $2; do
+			grep -q -E "EXPORT_SYMBOL.*($1)" "$LINUX/$file" 2>/dev/null
+			rc=$?
+			if test $rc -eq 0; then
+				export=1
+				break;
+			fi
+		done
+		if test $export -eq 0; then
+			AC_MSG_RESULT([no])
+			$4
+		else
+			AC_MSG_RESULT([yes])
+			$3
+		fi
+	else
+		AC_MSG_RESULT([yes])
+		$3
+	fi
+])
+
+dnl #
+dnl # 2.6.x API change
+dnl # bio_end_io_t uses 2 args (size was dropped from prototype)
+dnl #
+AC_DEFUN([ZFS_AC_2ARGS_BIO_END_IO_T],
+	[AC_MSG_CHECKING([whether bio_end_io_t wants 2 args])
+	tmp_flags="$EXTRA_KCFLAGS"
+	EXTRA_KCFLAGS="-Werror"
+	ZFS_LINUX_TRY_COMPILE([
+		#include <linux/bio.h>
+	],[
+		void (*wanted_end_io)(struct bio *, int) = NULL;
+		bio_end_io_t *local_end_io;
+
+		local_end_io = wanted_end_io;
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_2ARGS_BIO_END_IO_T, 1,
+			[bio_end_io_t wants 2 args])
+	],[
+		AC_MSG_RESULT(no)
+	])
+	EXTRA_KCFLAGS="$tmp_flags"
+])
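+As a usage illustration for the two checking macros above, a configure.ac
+fragment might look like the following; the config option, symbol, and
+file names are hypothetical examples rather than checks this package
+actually performs:
+
+	dnl # Fail if the kernel was built with CONFIG_PREEMPT (example policy)
+	ZFS_LINUX_CONFIG([PREEMPT],
+		[AC_MSG_ERROR([CONFIG_PREEMPT kernels are not supported])],
+		[])
+
+	dnl # Define HAVE_BIO_ENDIO if the kernel exports bio_endio()
+	ZFS_CHECK_SYMBOL_EXPORT([bio_endio],
+		[fs/bio.c],
+		[AC_DEFINE(HAVE_BIO_ENDIO, 1, [bio_endio() is exported])],
+		[])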
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000000..7834426bb2
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+find . -type d -name .deps | xargs rm -rf
+rm -rf config.guess config.sub ltmain.sh
+libtoolize --automake
+aclocal -I autoconf 2>/dev/null &&
+autoheader &&
+automake --add-missing --include-deps # 2>/dev/null &&
+autoconf
+
diff --git a/build/Makefile b/build/Makefile
new file mode 100644
index 0000000000..f84b8e9ccf
--- /dev/null
+++ b/build/Makefile
@@ -0,0 +1 @@
+obj-m := conftest.o
diff --git a/configs/Makefile.am b/configs/Makefile.am
new file mode 100644
index 0000000000..326b3d7601
--- /dev/null
+++ b/configs/Makefile.am
@@ -0,0 +1 @@
+EXTRA_DIST=kernel user lustre
diff --git a/configs/kernel b/configs/kernel
new file mode 100644
index 0000000000..fc1dfa3c59
--- /dev/null
+++ b/configs/kernel
@@ -0,0 +1,11 @@
+# Default ZFS kernel mode configuration
+
+UNAME=`uname -r | cut -d- -f1`
+
+CONFIG=kernel
+
+NAME=zfs
+BRANCH=`awk '/[Bb]ranch:/ {print $$2}' META`
+VERSION=`awk '/[Vv]ersion:/ {print $$2}' META`
+RELEASE=`awk '/[Rr]elease:/ {print $$2}' META`
+BUILDDIR=$NAME+$CONFIG
diff --git a/configs/lustre b/configs/lustre
new file mode 100644
index 0000000000..65b1603eee
--- /dev/null
+++ b/configs/lustre
@@ -0,0 +1,11 @@
+# Default ZFS lustre mode configuration
+
+UNAME=`uname -r | cut -d- -f1`
+
+CONFIG=lustre
+
+NAME=zfs
+BRANCH=`awk '/[Bb]ranch:/ {print $$2}' META`
+VERSION=`awk '/[Vv]ersion:/ {print $$2}' META`
+RELEASE=`awk '/[Rr]elease:/ {print $$2}' META`
+BUILDDIR=$NAME+$CONFIG
diff --git a/configs/user b/configs/user
new file mode 100644
index 0000000000..f79ba8fa8e
--- /dev/null
+++ b/configs/user
@@ -0,0 +1,9 @@
+# Default ZFS user mode configuration
+
+CONFIG=user
+
+NAME=zfs
+BRANCH=`awk '/[Bb]ranch:/ {print $$2}' META`
+VERSION=`awk '/[Vv]ersion:/ {print $$2}' META`
+RELEASE=`awk '/[Rr]elease:/ {print $$2}' META`
+BUILDDIR=$NAME+$CONFIG
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000000..7d099854b2
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,223 @@
+#
+# This file is part of the ZFS Linux port.
+#
+# Copyright (c) 2008 Lawrence Livermore National Security, LLC.
+# Produced at Lawrence Livermore National Laboratory
+# Written by:
+#         Brian Behlendorf ,
+#         Herb Wartens ,
+#         Jim Garlick
+# LLNL-CODE-403049
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+AC_INIT
+AC_LANG(C)
+
+AC_CANONICAL_SYSTEM
+AM_INIT_AUTOMAKE(zfs, 0.4.0)
+AC_CONFIG_HEADERS([zfs_config.h])
+
+AC_PROG_INSTALL
+AC_PROG_CC
+AC_PROG_LIBTOOL
+
+zfsconfig=kernel
+kernelsrc=
+kernelbuild=
+splsrc=
+splbuild=
+
+ZFS_AC_CONFIG
+ZFS_AC_KERNEL
+ZFS_AC_SPL
+ZFS_AC_SCRIPT_CONFIG
+ZFS_AC_LICENSE
+ZFS_AC_DEBUG
+ZFS_AC_2ARGS_BIO_END_IO_T
+
+AC_SUBST(UNAME)
+AC_SUBST(CONFIG)
+AC_SUBST(NAME)
+AC_SUBST(SVNURL)
+AC_SUBST(BRANCH)
+AC_SUBST(VERSION)
+AC_SUBST(RELEASE)
+AC_SUBST(BRANCHURL)
+AC_SUBST(TAGURL)
+AC_SUBST(BUILDURL)
+AC_SUBST(BUILDDIR)
+
+# Check for needed userspace bits
+AC_CHECK_HEADERS(sys/types.h sys/byteorder.h sys/isa_defs.h \
+	sys/systeminfo.h sys/u8_textprep.h libdiskmgt.h)
+
+AC_CHECK_FUNCS(strlcat strlcpy strnlen issetugid setmntent getexecname)
+
+AC_CHECK_LIB([diskmgt], [libdiskmgt_error],
+	[AC_DEFINE([HAVE_LIBDISKMGT], 1,
+	[Define to 1 if 'libdiskmgt' library available])])
+
+AC_CHECK_LIB([efi], [efi_alloc_and_init],
+	[AC_DEFINE([HAVE_LIBEFI], 1,
+	[Define to 1 if 'libefi' library available])])
+
+AC_CHECK_LIB([share], [sa_init],
+	[AC_DEFINE([HAVE_LIBSHARE], 1,
+	[Define to 1 if 'libshare' library available])])
+
+AC_EGREP_HEADER(ioctl, unistd.h,
+	[AC_DEFINE([HAVE_IOCTL_IN_UNISTD_H], 1,
+	[Define to 1 if ioctl() is defined in header file])])
+
+AC_EGREP_HEADER(ioctl, sys/ioctl.h,
+	[AC_DEFINE([HAVE_IOCTL_IN_SYS_IOCTL_H], 1,
+	[Define to 1 if ioctl() is defined in header file])])
+
+AC_EGREP_HEADER(ioctl, stropts.h,
+	[AC_DEFINE([HAVE_IOCTL_IN_STROPTS_H], 1,
+	[Define to 1 if ioctl() is defined in header file])])
+
+AC_EGREP_HEADER(strcmp, strings.h,
+	[AC_DEFINE([HAVE_STRCMP_IN_STRINGS_H], 1,
+	[Define to 1 if strcmp() is defined in header file])])
+
+AC_EGREP_HEADER(sysinfo, sys/systeminfo.h,
+	[AC_DEFINE([HAVE_SYSINFO_IN_SYS_SYSTEMINFO_H], 1,
+	[Define to 1 if sysinfo() is defined in header file])])
+
+#AC_DEFINE([HAVE_ZVOL], 1, ["Define to 1 to include ZVOL support"])
+#AC_DEFINE([HAVE_ZPL], 1, ["Define to 1 to include ZPL support"])
+#AC_DEFINE([WANT_FAKE_IOCTL], 1, ["Define to 1 to use fake ioctl() support"])
+#AC_DEFINE([HAVE_DM_INUSE_SWAP], 1, ["None"])
+#AC_DEFINE([HAVE_UNICODE], 1, ["None"])
+#AC_DEFINE([HAVE_INTTYPES], 1, [Define to 1 if uint16 defined in header file])
+
+# Add "V=1" to KERNELMAKE_PARAMS to enable verbose module build
+KERNELMAKE_PARAMS=
+KERNELCPPFLAGS="$KERNELCPPFLAGS -DHAVE_SPL -D_KERNEL -I$splsrc -I$splsrc/include -I$TOPDIR"
+
+# Minimally required for pread() functionality and other GNU goodness
+HOSTCFLAGS="$HOSTCFLAGS -ggdb -O2 -std=c99 -D_GNU_SOURCE -D__EXTENSIONS__ "
+# Quiet warnings not covered by the gcc-* patches
+HOSTCFLAGS="$HOSTCFLAGS -Wno-switch -Wno-unused -Wno-missing-braces -Wno-parentheses "
+HOSTCFLAGS="$HOSTCFLAGS -Wno-uninitialized -fno-strict-aliasing "
+# Expected defines not covered by zfs_config.h
+HOSTCFLAGS="$HOSTCFLAGS -DHAVE_SPL -D_POSIX_PTHREAD_SEMANTICS "
+HOSTCFLAGS="$HOSTCFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_REENTRANT "
+HOSTCFLAGS="$HOSTCFLAGS -DTEXT_DOMAIN=\\\"zfs-linux-kernel\\\" "
+# Expected default include paths; additional paths added by Makefiles
+HOSTCFLAGS="$HOSTCFLAGS -I$TOPDIR "
+
+if test "$kernelbuild" != "$kernelsrc"; then
+	KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$kernelbuild"
+fi
+
+AC_SUBST(KERNELMAKE_PARAMS)
+AC_SUBST(KERNELCPPFLAGS) +AC_SUBST(HOSTCFLAGS) + +AC_CONFIG_FILES([ Makefile + autoconf/Makefile + configs/Makefile + doc/Makefile + scripts/Makefile + zfs/Makefile + zfs/lib/libudmu/include/Makefile + zfs/lib/libudmu/Makefile + zfs/lib/Makefile + zfs/lib/libnvpair/include/sys/Makefile + zfs/lib/libnvpair/include/Makefile + zfs/lib/libnvpair/Makefile + zfs/lib/libsolcompat/sparc64/Makefile + zfs/lib/libsolcompat/Makefile + zfs/lib/libsolcompat/include/tsol/Makefile + zfs/lib/libsolcompat/include/sparc64/sys/Makefile + zfs/lib/libsolcompat/include/sparc64/Makefile + zfs/lib/libsolcompat/include/rpc/Makefile + zfs/lib/libsolcompat/include/i386/sys/Makefile + zfs/lib/libsolcompat/include/i386/Makefile + zfs/lib/libsolcompat/include/ia32/sys/Makefile + zfs/lib/libsolcompat/include/ia32/Makefile + zfs/lib/libsolcompat/include/amd64/sys/Makefile + zfs/lib/libsolcompat/include/amd64/Makefile + zfs/lib/libsolcompat/include/sys/sysevent/Makefile + zfs/lib/libsolcompat/include/sys/fm/Makefile + zfs/lib/libsolcompat/include/sys/Makefile + zfs/lib/libsolcompat/include/Makefile + zfs/lib/libsolcompat/i386/Makefile + zfs/lib/libsolcompat/amd64/Makefile + zfs/lib/libavl/include/sys/Makefile + zfs/lib/libavl/include/Makefile + zfs/lib/libavl/Makefile + zfs/lib/libuutil/include/Makefile + zfs/lib/libuutil/Makefile + zfs/lib/libzfs/include/Makefile + zfs/lib/libzfs/Makefile + zfs/lib/libumem/include/Makefile + zfs/lib/libumem/Makefile + zfs/lib/libumem/sys/Makefile + zfs/lib/libzcommon/include/Makefile + zfs/lib/libzcommon/include/sys/fm/fs/Makefile + zfs/lib/libzcommon/include/sys/fm/Makefile + zfs/lib/libzcommon/include/sys/Makefile + zfs/lib/libzcommon/include/sys/fs/Makefile + zfs/lib/libzcommon/Makefile + zfs/lib/libzpool/Makefile + zfs/lib/libport/include/sys/Makefile + zfs/lib/libport/include/Makefile + zfs/lib/libport/Makefile + zfs/lib/libdmu-ctl/include/sys/Makefile + zfs/lib/libdmu-ctl/include/Makefile + zfs/lib/libdmu-ctl/Makefile + zfs/zcmd/ztest/Makefile + zfs/zcmd/Makefile + zfs/zcmd/zfs/Makefile + zfs/zcmd/zdb/Makefile + zfs/zcmd/zinject/Makefile + zfs/zcmd/zdump/Makefile + zfs/zcmd/zpool/Makefile + ]) +AC_OUTPUT + +# HACK: I really, really hate this... but to ensure the kernel build +# system compiles C files shared between a library and a kernel module, +# we need to ensure each file has a unique make target. To do that +# I'm creating symlinks for each shared file at configure time. It +# may be possible something better can be done in the Makefile but it +# will take some serious investigation and I don't have the time now. 
+ +echo +echo "Creating symlinks for additional make targets" +ln -s $LIBDIR/libport/u8_textprep.c $LIBDIR/libport/ku8_textprep.c +ln -s $LIBDIR/libavl/avl.c $LIBDIR/libavl/kavl.c +ln -s $LIBDIR/libavl/avl.c $LIBDIR/libavl/uavl.c +ln -s $LIBDIR/libnvpair/nvpair.c $LIBDIR/libnvpair/knvpair.c +ln -s $LIBDIR/libnvpair/nvpair.c $LIBDIR/libnvpair/unvpair.c +ln -s $LIBDIR/libzcommon/zfs_deleg.c $LIBDIR/libzcommon/kzfs_deleg.c +ln -s $LIBDIR/libzcommon/zfs_prop.c $LIBDIR/libzcommon/kzfs_prop.c +ln -s $LIBDIR/libzcommon/zprop_common.c $LIBDIR/libzcommon/kzprop_common.c +ln -s $LIBDIR/libzcommon/compress.c $LIBDIR/libzcommon/kcompress.c +ln -s $LIBDIR/libzcommon/list.c $LIBDIR/libzcommon/klist.c +ln -s $LIBDIR/libzcommon/zfs_namecheck.c $LIBDIR/libzcommon/kzfs_namecheck.c +ln -s $LIBDIR/libzcommon/zfs_comutil.c $LIBDIR/libzcommon/kzfs_comutil.c +ln -s $LIBDIR/libzcommon/zpool_prop.c $LIBDIR/libzcommon/kzpool_prop.c diff --git a/doc/LEGAL b/doc/LEGAL new file mode 100644 index 0000000000..905141bcab --- /dev/null +++ b/doc/LEGAL @@ -0,0 +1,113 @@ +From: Chris Dunlap +To: tak1@llnl.gov (James Tak) +Cc: rogers11@llnl.gov (Leah Rogers), garlick@llnl.gov (Jim Garlick), + mgary@llnl.gov (Mark Gary), kimcupps@llnl.gov (Kim Cupps) +Date: Mon, 26 Mar 2007 15:37:07 -0700 +Subject: CDDL/GPL licensing issues for ZFS Linux port + +James, + +We want to port Sun's Zettabyte File System (ZFS) to Linux and +ultimately redistribute the source code of our work. We've been +talking with Leah about this and have a meeting scheduled with you +for this coming Thursday at 2pm. I just wanted to give you a summary +before the meeting of what we're proposing. + +ZFS is part of OpenSolaris which is licensed under the Common +Development and Distribution License (CDDL): + + http://www.opensolaris.org/os/licensing/cddllicense.txt + +The Linux kernel is licensed under the GNU General Public License (GPL) +(specifically, under version 2 of the license only): + + http://www.fsf.org/licensing/licenses/gpl.html + +While these are both Open-Source licenses, the Free Software Foundation +(FSF) states they are incompatible with one another: + + http://www.fsf.org/licensing/licenses/index_html + + "[CDDL] is a free software license which is not a strong copyleft; + it has some complex restrictions that make it incompatible with the + GNU GPL. It requires that all attribution notices be maintained, + while the GPL only requires certain types of notices. Also, it + terminates in retaliation for certain aggressive uses of patents. + So, a module covered by the GPL and a module covered by the CDDL + cannot legally be linked together." + +As an aside, Sun is reportedly considering releasing OpenSolaris under +GPL3 (i.e., the upcoming version 3 of the GNU General Public License): + + http://blogs.sun.com/jonathan/entry/hp_and_sun_partnering_around + + http://arstechnica.com/news.ars/post/20060130-6074.html + + http://news.com.com/Sun+considers+GPL+3+license+for+Solaris/2100-1016_3-6032893.html + +Since the GPL3 has not been finalized, it is unclear whether +incompatibilities will exist between GPL2 and GPL3. + +Linus Torvalds (the original creator of Linux) describes his views +on the licensing of Linux kernel modules in the following email thread: + + http://linuxmafia.com/faq/Kernel/proprietary-kernel-modules.html + +Most of this thread is in regards to proprietary closed-source +binary-only modules for Linux. 
Linus generally considers modules +written for Linux using the kernel infrastructures to be derived +works of Linux, even if they don't copy any existing Linux code. +However, he specifically singles out drivers and filesystems ported +from other operating systems as not being derived works: + + "It would be rather preposterous to call the Andrew FileSystem a + 'derived work' of Linux, for example, so I think it's perfectly + OK to have a AFS module, for example." + + "The original binary-only modules were for things that were + pre-existing works of code, i.e., drivers and filesystems ported + from other operating systems, which thus could clearly be argued + to not be derived works..." + +Based on this, it seems our port of Sun's ZFS filesystem to Linux +would not be considered a derived work of Linux, and therefore not +covered by the GPL. The issue of the CDDL/GPL license incompatibility +becomes moot. As such, we should be able to redistribute our changes +to ZFS in source-code form licensed under the CDDL since this will +be a derived work of the original ZFS code. There seems to be some +dissent as to whether a binary module could be redistributed as well, +but that issue does not concern us. In this instance, we are only +interested in redistribution of our work in source-code form. + +-Chris + +To: Chris Dunlap +From: James Tak +Subject: Re: CDDL/GPL licensing issues for ZFS Linux port +Cc: rogers11@llnl.gov (Leah Rogers), garlick@llnl.gov (Jim Garlick), + mgary@llnl.gov (Mark Gary), kimcupps@llnl.gov (Kim Cupps) +Date: Thu, 29 Mar 2007 14:53:01 -0700 + +Hi Chris, +As per our discussion today, the ZFS port you are proposing releasing under +the CDDL license should be o.k. since it is a derivative work of the +original ZFS module (under CDDL) and is therefore also subject to CDDL +under the distribution terms of that license. While the issue of linking +has been greatly debated in the OS community, I think it is fair to say in +this instance the ZFS port is not a derivative work of Linux and thus not +subject to the GPL. Furthermore, it shouldn't be a problem especially +since even Linus Torvald has expressed that modules such as yours are not +derived works of Linux. + +Let me know if you have any further questions at x27274. Thanks. + +Regards, +James + +James S. 
Tak +Assistant Laboratory Counsel for Intellectual Property +Office of Laboratory Counsel +Lawrence Livermore National Laboratory +phone: (925) 422-7274 +fax: (925) 423-2231 +tak1@llnl.gov diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 0000000000..07f6386f45 --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = LEGAL diff --git a/doc/analysis-spl-0.3.1/zfs-report.doc b/doc/analysis-spl-0.3.1/zfs-report.doc new file mode 100644 index 0000000000..34638b2ead Binary files /dev/null and b/doc/analysis-spl-0.3.1/zfs-report.doc differ diff --git a/doc/analysis-spl-0.3.1/zfs-report.odt b/doc/analysis-spl-0.3.1/zfs-report.odt new file mode 100644 index 0000000000..fd52861136 Binary files /dev/null and b/doc/analysis-spl-0.3.1/zfs-report.odt differ diff --git a/doc/analysis-spl-0.3.1/zfs-report.pdf b/doc/analysis-spl-0.3.1/zfs-report.pdf new file mode 100644 index 0000000000..8bf166709d Binary files /dev/null and b/doc/analysis-spl-0.3.1/zfs-report.pdf differ diff --git a/doc/analysis-spl-0.3.1/zfs-testing.ods b/doc/analysis-spl-0.3.1/zfs-testing.ods new file mode 100644 index 0000000000..39a84e4377 Binary files /dev/null and b/doc/analysis-spl-0.3.1/zfs-testing.ods differ diff --git a/doc/analysis-spl-0.3.1/zfs-testing.xls b/doc/analysis-spl-0.3.1/zfs-testing.xls new file mode 100644 index 0000000000..f6de1e9847 Binary files /dev/null and b/doc/analysis-spl-0.3.1/zfs-testing.xls differ diff --git a/lib/libsolcompat/zu.c b/lib/libsolcompat/zu.c new file mode 100644 index 0000000000..31dbf45bf9 --- /dev/null +++ b/lib/libsolcompat/zu.c @@ -0,0 +1,4 @@ +int main(void) +{ + return 0; +} diff --git a/lib/libumem/zu.c b/lib/libumem/zu.c new file mode 100644 index 0000000000..31dbf45bf9 --- /dev/null +++ b/lib/libumem/zu.c @@ -0,0 +1,4 @@ +int main(void) +{ + return 0; +} diff --git a/lib/libuutil/zu.c b/lib/libuutil/zu.c new file mode 100644 index 0000000000..31dbf45bf9 --- /dev/null +++ b/lib/libuutil/zu.c @@ -0,0 +1,4 @@ +int main(void) +{ + return 0; +} diff --git a/lib/libzfs/zu.c b/lib/libzfs/zu.c new file mode 100644 index 0000000000..31dbf45bf9 --- /dev/null +++ b/lib/libzfs/zu.c @@ -0,0 +1,4 @@ +int main(void) +{ + return 0; +} diff --git a/patches/README b/patches/README new file mode 100644 index 0000000000..31af59708f --- /dev/null +++ b/patches/README @@ -0,0 +1,17 @@ +# +# Mostly patches for a userspace build. For now I'm leaving them all +# out until we go through the code base and sort out the userspace +# portion of the build system. We may find we do not want or need +# many of these patches anymore. -Brian +# +zap-cursor-move-to-key.patch # Add a ZAP API to move a ZAP cursor to a +given key. +spa-force-readonly.patch # Add API to discard all writes +no-debug-userspace.patch # Disable debug code on userspace +no-events.patch # Define away spa_event_notify() in userspace +pthreads.patch # Use POSIX threads in userspace. +port-no-zmod.patch # Do not use zmod.h in userspace. +port-pragma-init.patch # Use constructor attribute on non-Solaris +platforms. +lztest-lzdb.patch # Make lztest call lzdb from PATH. +zpool-force.patch # Change -f to -F in zpool command diff --git a/patches/lztest-lzdb.patch b/patches/lztest-lzdb.patch new file mode 100644 index 0000000000..37870b7493 --- /dev/null +++ b/patches/lztest-lzdb.patch @@ -0,0 +1,40 @@ +Make lztest call lzdb from PATH. 
+ +Index: zfs+chaos4/cmd/lztest/ztest.c +=================================================================== +--- zfs+chaos4.orig/cmd/lztest/ztest.c ++++ zfs+chaos4/cmd/lztest/ztest.c +@@ -3043,30 +3043,17 @@ ztest_verify_blocks(char *pool) + char zbuf[1024]; + char *bin; + char *ztest; +- char *isa; +- int isalen; + FILE *fp; + +- (void) realpath(getexecname(), zdb); +- +- /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ +- bin = strstr(zdb, "/usr/bin/"); +- ztest = strstr(bin, "/ztest"); +- isa = bin + 8; +- isalen = ztest - isa; +- isa = strdup(isa); + /* LINTED */ +- (void) sprintf(bin, +- "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache -O %s %s", +- isalen, +- isa, ++ (void) sprintf(zdb, ++ "lzdb -bc%s%s -U /tmp/zpool.cache -O %s %s", + zopt_verbose >= 3 ? "s" : "", + zopt_verbose >= 4 ? "v" : "", + ztest_random(2) == 0 ? "pre" : "post", pool); +- free(isa); + + if (zopt_verbose >= 5) +- (void) printf("Executing %s\n", strstr(zdb, "zdb ")); ++ (void) printf("Executing %s\n", strstr(zdb, "lzdb ")); + + fp = popen(zdb, "r"); + diff --git a/patches/no-debug-userspace.patch b/patches/no-debug-userspace.patch new file mode 100644 index 0000000000..616a69153c --- /dev/null +++ b/patches/no-debug-userspace.patch @@ -0,0 +1,184 @@ +Disable debug code on userspace + +Index: zfs+chaos4/lib/libzfscommon/include/sys/arc.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/arc.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/arc.h +@@ -82,7 +82,7 @@ int arc_released(arc_buf_t *buf); + int arc_has_callback(arc_buf_t *buf); + void arc_buf_freeze(arc_buf_t *buf); + void arc_buf_thaw(arc_buf_t *buf); +-#ifdef ZFS_DEBUG ++#if defined(ZFS_DEBUG) || (!defined(_KERNEL) && !defined(NDEBUG)) + int arc_referenced(arc_buf_t *buf); + #endif + +Index: zfs+chaos4/lib/libzfscommon/include/sys/refcount.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/refcount.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/refcount.h +@@ -43,7 +43,7 @@ extern "C" { + */ + #define FTAG ((char *)__func__) + +-#if defined(DEBUG) || !defined(_KERNEL) ++#if defined(DEBUG) + typedef struct reference { + list_node_t ref_link; + void *ref_holder; +Index: zfs+chaos4/lib/libzfscommon/include/sys/zfs_context_user.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/zfs_context_user.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/zfs_context_user.h +@@ -96,6 +96,8 @@ extern "C" { + + #ifdef ZFS_DEBUG + extern void dprintf_setup(int *argc, char **argv); ++#else ++#define dprintf_setup(ac,av) ((void) 0) + #endif /* ZFS_DEBUG */ + + extern void cmn_err(int, const char *, ...); +@@ -105,21 +107,26 @@ extern void vpanic(const char *, __va_li + + #define fm_panic panic + ++#ifndef zp_verify + /* This definition is copied from assert.h. 
*/ + #if defined(__STDC__) + #if __STDC_VERSION__ - 0 >= 199901L +-#define verify(EX) (void)((EX) || \ ++#define zp_verify(EX) (void)((EX) || \ + (__assert_c99(#EX, __FILE__, __LINE__, __func__), 0)) + #else +-#define verify(EX) (void)((EX) || (__assert(#EX, __FILE__, __LINE__), 0)) ++#define zp_verify(EX) (void)((EX) || (__assert(#EX, __FILE__, __LINE__), 0)) + #endif /* __STDC_VERSION__ - 0 >= 199901L */ + #else +-#define verify(EX) (void)((EX) || (_assert("EX", __FILE__, __LINE__), 0)) ++#define zp_verify(EX) (void)((EX) || (_assert("EX", __FILE__, __LINE__), 0)) + #endif /* __STDC__ */ ++#endif + +- +-#define VERIFY verify ++#ifndef VERIFY ++#define VERIFY zp_verify ++#endif ++#ifndef ASSERT + #define ASSERT assert ++#endif + + extern void __assert(const char *, const char *, int); + +@@ -332,6 +339,7 @@ extern int taskq_member(taskq_t *, void + typedef struct vnode { + uint64_t v_size; + int v_fd; ++ mode_t v_mode; + char *v_path; + } vnode_t; + +Index: zfs+chaos4/lib/libzfscommon/include/sys/zfs_debug.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/zfs_debug.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/zfs_debug.h +@@ -44,7 +44,7 @@ extern "C" { + * ZFS debugging + */ + +-#if defined(DEBUG) || !defined(_KERNEL) ++#if defined(DEBUG) + #define ZFS_DEBUG + #endif + +Index: zfs+chaos4/lib/libzpool/arc.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/arc.c ++++ zfs+chaos4/lib/libzpool/arc.c +@@ -1802,7 +1802,7 @@ arc_reclaim_needed(void) + return (1); + #endif + +-#else ++#elif defined(ZFS_DEBUG) + if (spa_get_random(100) == 0) + return (1); + #endif +@@ -2881,7 +2881,7 @@ arc_has_callback(arc_buf_t *buf) + return (buf->b_efunc != NULL); + } + +-#ifdef ZFS_DEBUG ++#if defined(ZFS_DEBUG) || (!defined(_KERNEL) && !defined(NDEBUG)) + int + arc_referenced(arc_buf_t *buf) + { +Index: zfs+chaos4/lib/libzpool/kernel.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/kernel.c ++++ zfs+chaos4/lib/libzpool/kernel.c +@@ -384,6 +384,7 @@ vn_open(char *path, int x1, int flags, i + + vp->v_fd = fd; + vp->v_size = st.st_size; ++ vp->v_mode = st.st_mode; + vp->v_path = spa_strdup(path); + + return (0); +@@ -422,10 +423,17 @@ vn_rdwr(int uio, vnode_t *vp, void *addr + * To simulate partial disk writes, we split writes into two + * system calls so that the process can be killed in between. + */ +- split = (len > 0 ? rand() % len : 0); +- iolen = pwrite64(vp->v_fd, addr, split, offset); +- iolen += pwrite64(vp->v_fd, (char *)addr + split, +- len - split, offset + split); ++#ifdef ZFS_DEBUG ++ if (!S_ISBLK(vp->v_mode) && !S_ISCHR(vp->v_mode)) { ++ split = (len > 0 ? 
rand() % len : 0); ++ iolen = pwrite64(vp->v_fd, addr, split, offset); ++ iolen += pwrite64(vp->v_fd, (char *)addr + split, ++ len - split, offset + split); ++ } else ++ iolen = pwrite64(vp->v_fd, addr, len, offset); ++#else ++ iolen = pwrite64(vp->v_fd, addr, len, offset); ++#endif + } + + if (iolen < 0) +Index: zfs+chaos4/lib/libzpool/refcount.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/refcount.c ++++ zfs+chaos4/lib/libzpool/refcount.c +@@ -28,7 +28,7 @@ + #include + #include + +-#if defined(DEBUG) || !defined(_KERNEL) ++#if defined(DEBUG) + + #ifdef _KERNEL + int reference_tracking_enable = FALSE; /* runs out of memory too easily */ +Index: zfs+chaos4/lib/libzpool/spa_misc.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/spa_misc.c ++++ zfs+chaos4/lib/libzpool/spa_misc.c +@@ -178,11 +178,15 @@ kmem_cache_t *spa_buffer_pool; + int spa_mode; + + #ifdef ZFS_DEBUG ++#ifdef _KERNEL + /* Everything except dprintf is on by default in debug builds */ + int zfs_flags = ~ZFS_DEBUG_DPRINTF; + #else ++int zfs_flags = ~0; ++#endif /* _KERNEL */ ++#else + int zfs_flags = 0; +-#endif ++#endif /* ZFS_DEBUG */ + + /* + * zfs_recover can be set to nonzero to attempt to recover from diff --git a/patches/no-events.patch b/patches/no-events.patch new file mode 100644 index 0000000000..054b7ae17c --- /dev/null +++ b/patches/no-events.patch @@ -0,0 +1,44 @@ +Define away spa_event_notify() in userspace - not necessary and breaks compilation in older Solaris builds. + +Index: zfs+chaos4/lib/libzfscommon/include/sys/spa.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/spa.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/spa.h +@@ -516,7 +516,11 @@ extern int spa_prop_get(spa_t *spa, nvli + extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); + + /* asynchronous event notification */ ++#ifdef _KERNEL + extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); ++#else ++#define spa_event_notify(s,v,n) ((void) 0) ++#endif + + #ifdef ZFS_DEBUG + #define dprintf_bp(bp, fmt, ...) do { \ +Index: zfs+chaos4/lib/libzpool/spa.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/spa.c ++++ zfs+chaos4/lib/libzpool/spa.c +@@ -4449,10 +4449,10 @@ spa_has_spare(spa_t *spa, uint64_t guid) + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. + */ ++#ifdef _KERNEL + void + spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) + { +-#ifdef _KERNEL + sysevent_t *ev; + sysevent_attr_list_t *attr = NULL; + sysevent_value_t value; +@@ -4497,8 +4497,8 @@ done: + if (attr) + sysevent_free_attr(attr); + sysevent_free(ev); +-#endif + } ++#endif + + void + spa_discard_io(spa_t *spa) diff --git a/patches/port-no-zmod.patch b/patches/port-no-zmod.patch new file mode 100644 index 0000000000..34cabd1fd8 --- /dev/null +++ b/patches/port-no-zmod.patch @@ -0,0 +1,112 @@ +Do not use zmod.h in userspace. 
+ +Index: zfs+chaos4/lib/libzpool/gzip.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/gzip.c ++++ zfs+chaos4/lib/libzpool/gzip.c +@@ -28,22 +28,35 @@ + + #include + #include +-#include + + #ifdef _KERNEL ++ + #include +-#else ++#include ++ ++typedef size_t zlen_t; ++#define compress_func z_compress_level ++#define uncompress_func z_uncompress ++ ++#else /* _KERNEL */ ++ + #include ++#include ++ ++typedef uLongf zlen_t; ++#define compress_func compress2 ++#define uncompress_func uncompress ++ + #endif + + size_t + gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) + { +- size_t dstlen = d_len; ++ zlen_t dstlen = d_len; + + ASSERT(d_len <= s_len); + +- if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) { ++ if (compress_func(d_start, &dstlen, s_start, s_len, n) != Z_OK) { + if (d_len != s_len) + return (s_len); + +@@ -51,18 +64,18 @@ gzip_compress(void *s_start, void *d_sta + return (s_len); + } + +- return (dstlen); ++ return ((size_t) dstlen); + } + + /*ARGSUSED*/ + int + gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) + { +- size_t dstlen = d_len; ++ zlen_t dstlen = d_len; + + ASSERT(d_len >= s_len); + +- if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK) ++ if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK) + return (-1); + + return (0); +Index: zfs+chaos4/lib/libzpool/kernel.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/kernel.c ++++ zfs+chaos4/lib/libzpool/kernel.c +@@ -36,7 +36,6 @@ + #include + #include + #include +-#include + #include + #include + +@@ -876,31 +875,6 @@ kernel_fini(void) + urandom_fd = -1; + } + +-int +-z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) +-{ +- int ret; +- uLongf len = *dstlen; +- +- if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK) +- *dstlen = (size_t)len; +- +- return (ret); +-} +- +-int +-z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, +- int level) +-{ +- int ret; +- uLongf len = *dstlen; +- +- if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK) +- *dstlen = (size_t)len; +- +- return (ret); +-} +- + /*ARGSUSED*/ + size_t u8_textprep_str(char *i, size_t *il, char *o, size_t *ol, int nf, + size_t vers, int *err) diff --git a/patches/port-pragma-init.patch b/patches/port-pragma-init.patch new file mode 100644 index 0000000000..665aa9b62a --- /dev/null +++ b/patches/port-pragma-init.patch @@ -0,0 +1,52 @@ +Use constructor attribute on non-Solaris platforms. 
+ +Index: zfs+chaos4/lib/libuutil/uu_misc.c +=================================================================== +--- zfs+chaos4.orig/lib/libuutil/uu_misc.c ++++ zfs+chaos4/lib/libuutil/uu_misc.c +@@ -251,7 +251,13 @@ uu_release_child(void) + uu_release(); + } + ++#ifdef __GNUC__ ++static void ++uu_init(void) __attribute__((constructor)); ++#else + #pragma init(uu_init) ++#endif ++ + static void + uu_init(void) + { +Index: zfs+chaos4/lib/libzfs/libzfs_mount.c +=================================================================== +--- zfs+chaos4.orig/lib/libzfs/libzfs_mount.c ++++ zfs+chaos4/lib/libzfs/libzfs_mount.c +@@ -128,7 +128,13 @@ zfs_share_proto_t share_all_proto[] = { + PROTO_END + }; + ++#ifdef __GNUC__ ++static void ++zfs_iscsi_init(void) __attribute__((constructor)); ++#else + #pragma init(zfs_iscsi_init) ++#endif ++ + static void + zfs_iscsi_init(void) + { +@@ -548,8 +554,12 @@ static void (*_sa_update_sharetab_ts)(sa + * values to be used later. This is triggered by the runtime loader. + * Make sure the correct ISA version is loaded. + */ +- ++#ifdef __GNUC__ ++static void ++_zfs_init_libshare(void) __attribute__((constructor)); ++#else + #pragma init(_zfs_init_libshare) ++#endif + static void + _zfs_init_libshare(void) + { diff --git a/patches/pthreads.patch b/patches/pthreads.patch new file mode 100644 index 0000000000..ec29eb915a --- /dev/null +++ b/patches/pthreads.patch @@ -0,0 +1,924 @@ +Use POSIX threads in userspace. + +Index: zfs+chaos4/cmd/lztest/ztest.c +=================================================================== +--- zfs+chaos4.orig/cmd/lztest/ztest.c ++++ zfs+chaos4/cmd/lztest/ztest.c +@@ -141,7 +141,7 @@ typedef struct ztest_args { + spa_t *za_spa; + objset_t *za_os; + zilog_t *za_zilog; +- thread_t za_thread; ++ pthread_t za_thread; + uint64_t za_instance; + uint64_t za_random; + uint64_t za_diroff; +@@ -224,17 +224,17 @@ ztest_info_t ztest_info[] = { + * Stuff we need to share writably between parent and child. + */ + typedef struct ztest_shared { +- mutex_t zs_vdev_lock; +- rwlock_t zs_name_lock; +- uint64_t zs_vdev_primaries; +- uint64_t zs_enospc_count; +- hrtime_t zs_start_time; +- hrtime_t zs_stop_time; +- uint64_t zs_alloc; +- uint64_t zs_space; +- ztest_info_t zs_info[ZTEST_FUNCS]; +- mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; +- uint64_t zs_seq[ZTEST_SYNC_LOCKS]; ++ pthread_mutex_t zs_vdev_lock; ++ pthread_rwlock_t zs_name_lock; ++ uint64_t zs_vdev_primaries; ++ uint64_t zs_enospc_count; ++ hrtime_t zs_start_time; ++ hrtime_t zs_stop_time; ++ uint64_t zs_alloc; ++ uint64_t zs_space; ++ ztest_info_t zs_info[ZTEST_FUNCS]; ++ pthread_mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; ++ uint64_t zs_seq[ZTEST_SYNC_LOCKS]; + } ztest_shared_t; + + static char ztest_dev_template[] = "%s/%s.%llua"; +@@ -818,7 +818,7 @@ ztest_spa_create_destroy(ztest_args_t *z + * Attempt to create an existing pool. It shouldn't matter + * what's in the nvroot; we should fail with EEXIST. 
+ */ +- (void) rw_rdlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); + nvroot = make_vdev_root(0, 0, 0, 0, 1); + error = spa_create(za->za_pool, nvroot, NULL, NULL); + nvlist_free(nvroot); +@@ -834,7 +834,7 @@ ztest_spa_create_destroy(ztest_args_t *z + fatal(0, "spa_destroy() = %d", error); + + spa_close(spa, FTAG); +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + } + + /* +@@ -851,7 +851,7 @@ ztest_vdev_add_remove(ztest_args_t *za) + if (zopt_verbose >= 6) + (void) printf("adding vdev\n"); + +- (void) mutex_lock(&ztest_shared->zs_vdev_lock); ++ (void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock); + + spa_config_enter(spa, RW_READER, FTAG); + +@@ -869,7 +869,7 @@ ztest_vdev_add_remove(ztest_args_t *za) + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + +- (void) mutex_unlock(&ztest_shared->zs_vdev_lock); ++ (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); +@@ -927,7 +927,7 @@ ztest_vdev_attach_detach(ztest_args_t *z + int error, expected_error; + int fd; + +- (void) mutex_lock(&ztest_shared->zs_vdev_lock); ++ (void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock); + + spa_config_enter(spa, RW_READER, FTAG); + +@@ -1054,7 +1054,7 @@ ztest_vdev_attach_detach(ztest_args_t *z + oldpath, newpath, replacing, error, expected_error); + } + +- (void) mutex_unlock(&ztest_shared->zs_vdev_lock); ++ (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); + } + + /* +@@ -1071,7 +1071,7 @@ ztest_vdev_LUN_growth(ztest_args_t *za) + size_t fsize; + int fd; + +- (void) mutex_lock(&ztest_shared->zs_vdev_lock); ++ (void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock); + + /* + * Pick a random leaf vdev. 
+@@ -1102,7 +1102,7 @@ ztest_vdev_LUN_growth(ztest_args_t *za) + (void) close(fd); + } + +- (void) mutex_unlock(&ztest_shared->zs_vdev_lock); ++ (void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock); + } + + /* ARGSUSED */ +@@ -1198,7 +1198,7 @@ ztest_dmu_objset_create_destroy(ztest_ar + uint64_t objects; + ztest_replay_t zr; + +- (void) rw_rdlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); + (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool, + (u_longlong_t)za->za_instance); + +@@ -1242,7 +1242,7 @@ ztest_dmu_objset_create_destroy(ztest_ar + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_create"); +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + return; + } + fatal(0, "dmu_objset_create(%s) = %d", name, error); +@@ -1321,7 +1321,7 @@ ztest_dmu_objset_create_destroy(ztest_ar + if (error) + fatal(0, "dmu_objset_destroy(%s) = %d", name, error); + +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + } + + /* +@@ -1335,7 +1335,7 @@ ztest_dmu_snapshot_create_destroy(ztest_ + char snapname[100]; + char osname[MAXNAMELEN]; + +- (void) rw_rdlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); + dmu_objset_name(os, osname); + (void) snprintf(snapname, 100, "%s@%llu", osname, + (u_longlong_t)za->za_instance); +@@ -1348,7 +1348,7 @@ ztest_dmu_snapshot_create_destroy(ztest_ + ztest_record_enospc("dmu_take_snapshot"); + else if (error != 0 && error != EEXIST) + fatal(0, "dmu_take_snapshot() = %d", error); +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + } + + #define ZTEST_TRAVERSE_BLOCKS 1000 +@@ -1992,7 +1992,7 @@ ztest_dmu_write_parallel(ztest_args_t *z + int bs = ZTEST_DIROBJ_BLOCKSIZE; + int do_free = 0; + uint64_t off, txg_how; +- mutex_t *lp; ++ pthread_mutex_t *lp; + char osname[MAXNAMELEN]; + char iobuf[SPA_MAXBLOCKSIZE]; + blkptr_t blk = { 0 }; +@@ -2041,7 +2041,7 @@ ztest_dmu_write_parallel(ztest_args_t *z + } + + lp = &ztest_shared->zs_sync_lock[b]; +- (void) mutex_lock(lp); ++ (void) pthread_mutex_lock(lp); + + wbt->bt_objset = dmu_objset_id(os); + wbt->bt_object = ZTEST_DIROBJ; +@@ -2087,7 +2087,7 @@ ztest_dmu_write_parallel(ztest_args_t *z + dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx); + } + +- (void) mutex_unlock(lp); ++ (void) pthread_mutex_unlock(lp); + + if (ztest_random(1000) == 0) + (void) poll(NULL, 0, 1); /* open dn_notxholds window */ +@@ -2106,7 +2106,7 @@ ztest_dmu_write_parallel(ztest_args_t *z + /* + * dmu_sync() the block we just wrote. 
+ */ +- (void) mutex_lock(lp); ++ (void) pthread_mutex_lock(lp); + + blkoff = P2ALIGN_TYPED(off, bs, uint64_t); + error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db); +@@ -2114,7 +2114,7 @@ ztest_dmu_write_parallel(ztest_args_t *z + if (error) { + dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n", + osname, ZTEST_DIROBJ, blkoff, error); +- (void) mutex_unlock(lp); ++ (void) pthread_mutex_unlock(lp); + return; + } + blkoff = off - blkoff; +@@ -2122,7 +2122,7 @@ ztest_dmu_write_parallel(ztest_args_t *z + dmu_buf_rele(db, FTAG); + za->za_dbuf = NULL; + +- (void) mutex_unlock(lp); ++ (void) pthread_mutex_unlock(lp); + + if (error) { + dprintf("dmu_sync(%s, %d, %llx) = %d\n", +@@ -2502,7 +2502,7 @@ ztest_dsl_prop_get_set(ztest_args_t *za) + char osname[MAXNAMELEN]; + int error; + +- (void) rw_rdlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); + + dmu_objset_name(os, osname); + +@@ -2541,7 +2541,7 @@ ztest_dsl_prop_get_set(ztest_args_t *za) + } + } + +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + } + + static void +@@ -2693,7 +2693,7 @@ ztest_spa_rename(ztest_args_t *za) + int error; + spa_t *spa; + +- (void) rw_wrlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_wrlock(&ztest_shared->zs_name_lock); + + oldname = za->za_pool; + newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); +@@ -2745,7 +2745,7 @@ ztest_spa_rename(ztest_args_t *za) + + umem_free(newname, strlen(newname) + 1); + +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + } + + +@@ -3090,13 +3090,13 @@ ztest_run(char *pool) + ztest_args_t *za; + spa_t *spa; + char name[100]; +- thread_t tid; ++ pthread_t tid; + +- (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL); +- (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL); ++ (void) pthread_mutex_init(&zs->zs_vdev_lock, NULL); ++ (void) pthread_rwlock_init(&zs->zs_name_lock, NULL); + + for (t = 0; t < ZTEST_SYNC_LOCKS; t++) +- (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL); ++ (void) pthread_mutex_init(&zs->zs_sync_lock[t], NULL); + + /* + * Destroy one disk before we even start. +@@ -3153,7 +3153,7 @@ ztest_run(char *pool) + * start the thread before setting the zio_io_fail_shift, which + * will indicate our failure rate. 
+ */ +- error = thr_create(0, 0, ztest_suspend_monitor, NULL, THR_BOUND, &tid); ++ error = pthread_create(&tid, NULL, ztest_suspend_monitor, NULL); + if (error) { + fatal(0, "can't create suspend monitor thread: error %d", + t, error); +@@ -3217,7 +3217,7 @@ ztest_run(char *pool) + if (t < zopt_datasets) { + ztest_replay_t zr; + int test_future = FALSE; +- (void) rw_rdlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); + (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); + error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, + ztest_create_cb, NULL); +@@ -3225,7 +3225,7 @@ ztest_run(char *pool) + test_future = TRUE; + } else if (error == ENOSPC) { + zs->zs_enospc_count++; +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + break; + } else if (error != 0) { + fatal(0, "dmu_objset_create(%s) = %d", +@@ -3236,7 +3236,7 @@ ztest_run(char *pool) + if (error) + fatal(0, "dmu_objset_open('%s') = %d", + name, error); +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + if (test_future) + ztest_dmu_check_future_leak(&za[t]); + zr.zr_os = za[d].za_os; +@@ -3245,15 +3245,15 @@ ztest_run(char *pool) + za[d].za_zilog = zil_open(za[d].za_os, NULL); + } + +- error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND, +- &za[t].za_thread); ++ error = pthread_create(&za[t].za_thread, NULL, ztest_thread, ++ &za[t]); + if (error) + fatal(0, "can't create thread %d: error %d", + t, error); + } + + while (--t >= 0) { +- error = thr_join(za[t].za_thread, NULL, NULL); ++ error = pthread_join(za[t].za_thread, NULL); + if (error) + fatal(0, "thr_join(%d) = %d", t, error); + if (za[t].za_th) +@@ -3276,14 +3276,14 @@ ztest_run(char *pool) + * If we had out-of-space errors, destroy a random objset. + */ + if (zs->zs_enospc_count != 0) { +- (void) rw_rdlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_rdlock(&ztest_shared->zs_name_lock); + d = (int)ztest_random(zopt_datasets); + (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); + if (zopt_verbose >= 3) + (void) printf("Destroying %s to free up space\n", name); + (void) dmu_objset_find(name, ztest_destroy_cb, &za[d], + DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); +- (void) rw_unlock(&ztest_shared->zs_name_lock); ++ (void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock); + } + + txg_wait_synced(spa_get_dsl(spa), 0); +@@ -3301,7 +3301,7 @@ ztest_run(char *pool) + mutex_enter(&spa->spa_zio_lock); + cv_broadcast(&spa->spa_zio_cv); + mutex_exit(&spa->spa_zio_lock); +- error = thr_join(tid, NULL, NULL); ++ error = pthread_join(tid, NULL); + if (error) + fatal(0, "thr_join(%d) = %d", tid, error); + +Index: zfs+chaos4/lib/libuutil/uu_misc.c +=================================================================== +--- zfs+chaos4.orig/lib/libuutil/uu_misc.c ++++ zfs+chaos4/lib/libuutil/uu_misc.c +@@ -37,7 +37,6 @@ + #include + #include + #include +-#include + #include + + #if !defined(TEXT_DOMAIN) +@@ -70,11 +69,12 @@ static va_list uu_panic_args; + static pthread_t uu_panic_thread; + + static uint32_t _uu_main_error; ++static __thread int _uu_main_thread = 0; + + void + uu_set_error(uint_t code) + { +- if (thr_main() != 0) { ++ if (_uu_main_thread) { + _uu_main_error = code; + return; + } +@@ -103,7 +103,7 @@ uu_set_error(uint_t code) + uint32_t + uu_error(void) + { +- if (thr_main() != 0) ++ if (_uu_main_thread) + return (_uu_main_error); + + if (uu_error_key_setup < 0) /* can't happen? 
*/ +@@ -255,5 +255,6 @@ uu_release_child(void) + static void + uu_init(void) + { ++ _uu_main_thread = 1; + (void) pthread_atfork(uu_lockup, uu_release, uu_release_child); + } +Index: zfs+chaos4/lib/libzfscommon/include/sys/zfs_context_user.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/zfs_context_user.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/zfs_context_user.h +@@ -52,8 +52,7 @@ extern "C" { + #include + #include + #include +-#include +-#include ++#include + #include + #include + #include +@@ -191,13 +190,15 @@ _NOTE(CONSTCOND) } while (0) + /* + * Threads + */ +-#define curthread ((void *)(uintptr_t)thr_self()) ++ ++/* XXX: not portable */ ++#define curthread ((void *)(uintptr_t)pthread_self()) + + typedef struct kthread kthread_t; + + #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ + zk_thread_create(func, arg) +-#define thread_exit() thr_exit(NULL) ++#define thread_exit() pthread_exit(NULL) + + extern kthread_t *zk_thread_create(void (*func)(), void *arg); + +@@ -207,28 +208,18 @@ extern kthread_t *zk_thread_create(void + /* + * Mutexes + */ ++#define MTX_MAGIC 0x9522f51362a6e326ull + typedef struct kmutex { + void *m_owner; +- boolean_t initialized; +- mutex_t m_lock; ++ uint64_t m_magic; ++ pthread_mutex_t m_lock; + } kmutex_t; + +-#define MUTEX_DEFAULT USYNC_THREAD +-#undef MUTEX_HELD +-#define MUTEX_HELD(m) _mutex_held(&(m)->m_lock) +- +-/* +- * Argh -- we have to get cheesy here because the kernel and userland +- * have different signatures for the same routine. +- */ +-extern int _mutex_init(mutex_t *mp, int type, void *arg); +-extern int _mutex_destroy(mutex_t *mp); +- +-#define mutex_init(mp, b, c, d) zmutex_init((kmutex_t *)(mp)) +-#define mutex_destroy(mp) zmutex_destroy((kmutex_t *)(mp)) ++#define MUTEX_DEFAULT 0 ++#define MUTEX_HELD(m) ((m)->m_owner == curthread) + +-extern void zmutex_init(kmutex_t *mp); +-extern void zmutex_destroy(kmutex_t *mp); ++extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); ++extern void mutex_destroy(kmutex_t *mp); + extern void mutex_enter(kmutex_t *mp); + extern void mutex_exit(kmutex_t *mp); + extern int mutex_tryenter(kmutex_t *mp); +@@ -237,23 +228,24 @@ extern void *mutex_owner(kmutex_t *mp); + /* + * RW locks + */ ++#define RW_MAGIC 0x4d31fb123648e78aull + typedef struct krwlock { +- void *rw_owner; +- boolean_t initialized; +- rwlock_t rw_lock; ++ void *rw_owner; ++ void *rw_wr_owner; ++ uint64_t rw_magic; ++ pthread_rwlock_t rw_lock; ++ uint_t rw_readers; + } krwlock_t; + + typedef int krw_t; + + #define RW_READER 0 + #define RW_WRITER 1 +-#define RW_DEFAULT USYNC_THREAD +- +-#undef RW_READ_HELD +-#define RW_READ_HELD(x) _rw_read_held(&(x)->rw_lock) ++#define RW_DEFAULT 0 + +-#undef RW_WRITE_HELD +-#define RW_WRITE_HELD(x) _rw_write_held(&(x)->rw_lock) ++#define RW_READ_HELD(x) ((x)->rw_readers > 0) ++#define RW_WRITE_HELD(x) ((x)->rw_wr_owner == curthread) ++#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) + + extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); + extern void rw_destroy(krwlock_t *rwlp); +@@ -271,9 +263,13 @@ extern gid_t *crgetgroups(cred_t *cr); + /* + * Condition variables + */ +-typedef cond_t kcondvar_t; ++#define CV_MAGIC 0xd31ea9a83b1b30c4ull ++typedef struct kcondvar { ++ uint64_t cv_magic; ++ pthread_cond_t cv; ++} kcondvar_t; + +-#define CV_DEFAULT USYNC_THREAD ++#define CV_DEFAULT 0 + + extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); 
+ extern void cv_destroy(kcondvar_t *cv); +@@ -444,7 +440,8 @@ extern void delay(clock_t ticks); + #define minclsyspri 60 + #define maxclsyspri 99 + +-#define CPU_SEQID (thr_self() & (max_ncpus - 1)) ++/* XXX: not portable */ ++#define CPU_SEQID (pthread_self() & (max_ncpus - 1)) + + #define kcred NULL + #define CRED() NULL +Index: zfs+chaos4/lib/libzpool/kernel.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/kernel.c ++++ zfs+chaos4/lib/libzpool/kernel.c +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + + /* + * Emulation of kernel services in userland. +@@ -60,11 +61,15 @@ struct utsname utsname = { + kthread_t * + zk_thread_create(void (*func)(), void *arg) + { +- thread_t tid; ++ pthread_t tid; + +- VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED, +- &tid) == 0); ++ pthread_attr_t attr; ++ VERIFY(pthread_attr_init(&attr) == 0); ++ VERIFY(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) == 0); + ++ VERIFY(pthread_create(&tid, &attr, (void *(*)(void *))func, arg) == 0); ++ ++ /* XXX: not portable */ + return ((void *)(uintptr_t)tid); + } + +@@ -97,30 +102,37 @@ kstat_delete(kstat_t *ksp) + * ========================================================================= + */ + void +-zmutex_init(kmutex_t *mp) ++mutex_init(kmutex_t *mp, char *name, int type, void *cookie) + { ++ ASSERT(type == MUTEX_DEFAULT); ++ ASSERT(cookie == NULL); ++ ++#ifdef IM_FEELING_LUCKY ++ ASSERT(mp->m_magic != MTX_MAGIC); ++#endif ++ + mp->m_owner = NULL; +- mp->initialized = B_TRUE; +- (void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL); ++ mp->m_magic = MTX_MAGIC; ++ VERIFY3S(pthread_mutex_init(&mp->m_lock, NULL), ==, 0); + } + + void +-zmutex_destroy(kmutex_t *mp) ++mutex_destroy(kmutex_t *mp) + { +- ASSERT(mp->initialized == B_TRUE); ++ ASSERT(mp->m_magic == MTX_MAGIC); + ASSERT(mp->m_owner == NULL); +- (void) _mutex_destroy(&(mp)->m_lock); ++ VERIFY3S(pthread_mutex_destroy(&(mp)->m_lock), ==, 0); + mp->m_owner = (void *)-1UL; +- mp->initialized = B_FALSE; ++ mp->m_magic = 0; + } + + void + mutex_enter(kmutex_t *mp) + { +- ASSERT(mp->initialized == B_TRUE); ++ ASSERT(mp->m_magic == MTX_MAGIC); + ASSERT(mp->m_owner != (void *)-1UL); + ASSERT(mp->m_owner != curthread); +- VERIFY(mutex_lock(&mp->m_lock) == 0); ++ VERIFY3S(pthread_mutex_lock(&mp->m_lock), ==, 0); + ASSERT(mp->m_owner == NULL); + mp->m_owner = curthread; + } +@@ -128,9 +140,9 @@ mutex_enter(kmutex_t *mp) + int + mutex_tryenter(kmutex_t *mp) + { +- ASSERT(mp->initialized == B_TRUE); ++ ASSERT(mp->m_magic == MTX_MAGIC); + ASSERT(mp->m_owner != (void *)-1UL); +- if (0 == mutex_trylock(&mp->m_lock)) { ++ if (0 == pthread_mutex_trylock(&mp->m_lock)) { + ASSERT(mp->m_owner == NULL); + mp->m_owner = curthread; + return (1); +@@ -142,16 +154,16 @@ mutex_tryenter(kmutex_t *mp) + void + mutex_exit(kmutex_t *mp) + { +- ASSERT(mp->initialized == B_TRUE); ++ ASSERT(mp->m_magic == MTX_MAGIC); + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; +- VERIFY(mutex_unlock(&mp->m_lock) == 0); ++ VERIFY3S(pthread_mutex_unlock(&mp->m_lock), ==, 0); + } + + void * + mutex_owner(kmutex_t *mp) + { +- ASSERT(mp->initialized == B_TRUE); ++ ASSERT(mp->m_magic == MTX_MAGIC); + return (mp->m_owner); + } + +@@ -164,31 +176,48 @@ mutex_owner(kmutex_t *mp) + void + rw_init(krwlock_t *rwlp, char *name, int type, void *arg) + { +- rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL); ++ ASSERT(type == RW_DEFAULT); ++ ASSERT(arg == NULL); ++ ++#ifdef IM_FEELING_LUCKY ++ 
ASSERT(rwlp->rw_magic != RW_MAGIC); ++#endif ++ ++ VERIFY3S(pthread_rwlock_init(&rwlp->rw_lock, NULL), ==, 0); + rwlp->rw_owner = NULL; +- rwlp->initialized = B_TRUE; ++ rwlp->rw_wr_owner = NULL; ++ rwlp->rw_readers = 0; ++ rwlp->rw_magic = RW_MAGIC; + } + + void + rw_destroy(krwlock_t *rwlp) + { +- rwlock_destroy(&rwlp->rw_lock); +- rwlp->rw_owner = (void *)-1UL; +- rwlp->initialized = B_FALSE; ++ ASSERT(rwlp->rw_magic == RW_MAGIC); ++ ++ VERIFY3S(pthread_rwlock_destroy(&rwlp->rw_lock), ==, 0); ++ rwlp->rw_magic = 0; + } + + void + rw_enter(krwlock_t *rwlp, krw_t rw) + { +- ASSERT(!RW_LOCK_HELD(rwlp)); +- ASSERT(rwlp->initialized == B_TRUE); +- ASSERT(rwlp->rw_owner != (void *)-1UL); ++ ASSERT(rwlp->rw_magic == RW_MAGIC); + ASSERT(rwlp->rw_owner != curthread); ++ ASSERT(rwlp->rw_wr_owner != curthread); + +- if (rw == RW_READER) +- (void) rw_rdlock(&rwlp->rw_lock); +- else +- (void) rw_wrlock(&rwlp->rw_lock); ++ if (rw == RW_READER) { ++ VERIFY3S(pthread_rwlock_rdlock(&rwlp->rw_lock), ==, 0); ++ ASSERT(rwlp->rw_wr_owner == NULL); ++ ++ atomic_inc_uint(&rwlp->rw_readers); ++ } else { ++ VERIFY3S(pthread_rwlock_wrlock(&rwlp->rw_lock), ==, 0); ++ ASSERT(rwlp->rw_wr_owner == NULL); ++ ASSERT3U(rwlp->rw_readers, ==, 0); ++ ++ rwlp->rw_wr_owner = curthread; ++ } + + rwlp->rw_owner = curthread; + } +@@ -196,11 +225,16 @@ rw_enter(krwlock_t *rwlp, krw_t rw) + void + rw_exit(krwlock_t *rwlp) + { +- ASSERT(rwlp->initialized == B_TRUE); +- ASSERT(rwlp->rw_owner != (void *)-1UL); ++ ASSERT(rwlp->rw_magic == RW_MAGIC); ++ ASSERT(RW_LOCK_HELD(rwlp)); ++ ++ if (RW_READ_HELD(rwlp)) ++ atomic_dec_uint(&rwlp->rw_readers); ++ else ++ rwlp->rw_wr_owner = NULL; + + rwlp->rw_owner = NULL; +- (void) rw_unlock(&rwlp->rw_lock); ++ VERIFY3S(pthread_rwlock_unlock(&rwlp->rw_lock), ==, 0); + } + + int +@@ -208,19 +242,29 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw) + { + int rv; + +- ASSERT(rwlp->initialized == B_TRUE); +- ASSERT(rwlp->rw_owner != (void *)-1UL); ++ ASSERT(rwlp->rw_magic == RW_MAGIC); + + if (rw == RW_READER) +- rv = rw_tryrdlock(&rwlp->rw_lock); ++ rv = pthread_rwlock_tryrdlock(&rwlp->rw_lock); + else +- rv = rw_trywrlock(&rwlp->rw_lock); ++ rv = pthread_rwlock_trywrlock(&rwlp->rw_lock); + + if (rv == 0) { ++ ASSERT(rwlp->rw_wr_owner == NULL); ++ ++ if (rw == RW_READER) ++ atomic_inc_uint(&rwlp->rw_readers); ++ else { ++ ASSERT3U(rwlp->rw_readers, ==, 0); ++ rwlp->rw_wr_owner = curthread; ++ } ++ + rwlp->rw_owner = curthread; + return (1); + } + ++ VERIFY3S(rv, ==, EBUSY); ++ + return (0); + } + +@@ -228,8 +272,7 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw) + int + rw_tryupgrade(krwlock_t *rwlp) + { +- ASSERT(rwlp->initialized == B_TRUE); +- ASSERT(rwlp->rw_owner != (void *)-1UL); ++ ASSERT(rwlp->rw_magic == RW_MAGIC); + + return (0); + } +@@ -243,22 +286,34 @@ rw_tryupgrade(krwlock_t *rwlp) + void + cv_init(kcondvar_t *cv, char *name, int type, void *arg) + { +- VERIFY(cond_init(cv, type, NULL) == 0); ++ ASSERT(type == CV_DEFAULT); ++ ++#ifdef IM_FEELING_LUCKY ++ ASSERT(cv->cv_magic != CV_MAGIC); ++#endif ++ ++ cv->cv_magic = CV_MAGIC; ++ ++ VERIFY3S(pthread_cond_init(&cv->cv, NULL), ==, 0); + } + + void + cv_destroy(kcondvar_t *cv) + { +- VERIFY(cond_destroy(cv) == 0); ++ ASSERT(cv->cv_magic == CV_MAGIC); ++ VERIFY3S(pthread_cond_destroy(&cv->cv), ==, 0); ++ cv->cv_magic = 0; + } + + void + cv_wait(kcondvar_t *cv, kmutex_t *mp) + { ++ ASSERT(cv->cv_magic == CV_MAGIC); + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; +- int ret = cond_wait(cv, &mp->m_lock); +- VERIFY(ret == 0 || ret == EINTR); 
++ int ret = pthread_cond_wait(&cv->cv, &mp->m_lock); ++ if (ret != 0) ++ VERIFY3S(ret, ==, EINTR); + mp->m_owner = curthread; + } + +@@ -266,29 +321,38 @@ clock_t + cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) + { + int error; ++ struct timeval tv; + timestruc_t ts; + clock_t delta; + ++ ASSERT(cv->cv_magic == CV_MAGIC); ++ + top: + delta = abstime - lbolt; + if (delta <= 0) + return (-1); + +- ts.tv_sec = delta / hz; +- ts.tv_nsec = (delta % hz) * (NANOSEC / hz); ++ VERIFY(gettimeofday(&tv, NULL) == 0); ++ ++ ts.tv_sec = tv.tv_sec + delta / hz; ++ ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz); ++ if (ts.tv_nsec >= NANOSEC) { ++ ts.tv_sec++; ++ ts.tv_nsec -= NANOSEC; ++ } + + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; +- error = cond_reltimedwait(cv, &mp->m_lock, &ts); ++ error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); + mp->m_owner = curthread; + +- if (error == ETIME) ++ if (error == ETIMEDOUT) + return (-1); + + if (error == EINTR) + goto top; + +- ASSERT(error == 0); ++ VERIFY3S(error, ==, 0); + + return (1); + } +@@ -296,13 +360,15 @@ top: + void + cv_signal(kcondvar_t *cv) + { +- VERIFY(cond_signal(cv) == 0); ++ ASSERT(cv->cv_magic == CV_MAGIC); ++ VERIFY3S(pthread_cond_signal(&cv->cv), ==, 0); + } + + void + cv_broadcast(kcondvar_t *cv) + { +- VERIFY(cond_broadcast(cv) == 0); ++ ASSERT(cv->cv_magic == CV_MAGIC); ++ VERIFY3S(pthread_cond_broadcast(&cv->cv), ==, 0); + } + + /* +@@ -549,11 +615,11 @@ __dprintf(const char *file, const char * + dprintf_find_string(func)) { + /* Print out just the function name if requested */ + flockfile(stdout); +- /* XXX: the following printf may not be portable */ ++ /* XXX: the following 2 printfs may not be portable */ + if (dprintf_find_string("pid")) + (void) printf("%llu ", (u_longlong_t) getpid()); + if (dprintf_find_string("tid")) +- (void) printf("%u ", (uint_t) thr_self()); ++ (void) printf("%u ", (uint_t) pthread_self()); + if (dprintf_find_string("cpu")) + (void) printf("%u ", getcpuid()); + if (dprintf_find_string("time")) +Index: zfs+chaos4/lib/libzpool/taskq.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/taskq.c ++++ zfs+chaos4/lib/libzpool/taskq.c +@@ -43,7 +43,7 @@ struct taskq { + krwlock_t tq_threadlock; + kcondvar_t tq_dispatch_cv; + kcondvar_t tq_wait_cv; +- thread_t *tq_threadlist; ++ pthread_t *tq_threadlist; + int tq_flags; + int tq_active; + int tq_nthreads; +@@ -186,7 +186,7 @@ taskq_create(const char *name, int nthre + tq->tq_maxalloc = maxalloc; + tq->tq_task.task_next = &tq->tq_task; + tq->tq_task.task_prev = &tq->tq_task; +- tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); ++ tq->tq_threadlist = kmem_alloc(nthreads * sizeof (pthread_t), KM_SLEEP); + + if (flags & TASKQ_PREPOPULATE) { + mutex_enter(&tq->tq_lock); +@@ -196,8 +196,8 @@ taskq_create(const char *name, int nthre + } + + for (t = 0; t < nthreads; t++) +- VERIFY(thr_create(0, 0, taskq_thread, +- tq, THR_BOUND, &tq->tq_threadlist[t]) == 0); ++ VERIFY(pthread_create(&tq->tq_threadlist[t], ++ NULL, taskq_thread, tq) == 0); + + return (tq); + } +@@ -227,9 +227,9 @@ taskq_destroy(taskq_t *tq) + mutex_exit(&tq->tq_lock); + + for (t = 0; t < nthreads; t++) +- VERIFY(thr_join(tq->tq_threadlist[t], NULL, NULL) == 0); ++ VERIFY(pthread_join(tq->tq_threadlist[t], NULL) == 0); + +- kmem_free(tq->tq_threadlist, nthreads * sizeof (thread_t)); ++ kmem_free(tq->tq_threadlist, nthreads * sizeof (pthread_t)); + + 
rw_destroy(&tq->tq_threadlock); + mutex_destroy(&tq->tq_lock); +@@ -248,7 +248,7 @@ taskq_member(taskq_t *tq, void *t) + return (1); + + for (i = 0; i < tq->tq_nthreads; i++) +- if (tq->tq_threadlist[i] == (thread_t)(uintptr_t)t) ++ if (tq->tq_threadlist[i] == (pthread_t)(uintptr_t)t) + return (1); + + return (0); diff --git a/patches/zap-cursor-move-to-key.patch b/patches/zap-cursor-move-to-key.patch new file mode 100644 index 0000000000..ad5bbb93d5 --- /dev/null +++ b/patches/zap-cursor-move-to-key.patch @@ -0,0 +1,115 @@ +Add a ZAP API to move a ZAP cursor to a given key. + +Index: zfs+chaos4/lib/libzfscommon/include/sys/zap.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/zap.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/zap.h +@@ -302,6 +302,11 @@ void zap_cursor_advance(zap_cursor_t *zc + uint64_t zap_cursor_serialize(zap_cursor_t *zc); + + /* ++ * Advance the cursor to the attribute having the key. ++ */ ++int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); ++ ++/* + * Initialize a zap cursor pointing to the position recorded by + * zap_cursor_serialize (in the "serialized" argument). You can also + * use a "serialized" argument of 0 to start at the beginning of the +Index: zfs+chaos4/lib/libzfscommon/include/sys/zap_impl.h +=================================================================== +--- zfs+chaos4.orig/lib/libzfscommon/include/sys/zap_impl.h ++++ zfs+chaos4/lib/libzfscommon/include/sys/zap_impl.h +@@ -210,6 +210,7 @@ int fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, dmu_tx_t *tx); + void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); ++int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); + + #ifdef __cplusplus + } +Index: zfs+chaos4/lib/libzpool/zap.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/zap.c ++++ zfs+chaos4/lib/libzpool/zap.c +@@ -1029,6 +1029,30 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *t + } + } + ++int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) ++{ ++ int err; ++ zap_leaf_t *l; ++ zap_entry_handle_t zeh; ++ uint64_t hash; ++ ++ if (zn->zn_name_orij && strlen(zn->zn_name_orij) > ZAP_MAXNAMELEN) ++ return (E2BIG); ++ ++ err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); ++ if (err != 0) ++ return (err); ++ ++ err = zap_leaf_lookup(l, zn, &zeh); ++ if (err != 0) ++ return (err); ++ ++ zc->zc_leaf = l; ++ zc->zc_hash = zeh.zeh_hash; ++ zc->zc_cd = zeh.zeh_cd; ++ return 0; ++} ++ + void + fzap_get_stats(zap_t *zap, zap_stats_t *zs) + { +Index: zfs+chaos4/lib/libzpool/zap_micro.c +=================================================================== +--- zfs+chaos4.orig/lib/libzpool/zap_micro.c ++++ zfs+chaos4/lib/libzpool/zap_micro.c +@@ -1045,6 +1045,45 @@ zap_cursor_advance(zap_cursor_t *zc) + } + } + ++int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) ++{ ++ int err = 0; ++ mzap_ent_t *mze; ++ zap_name_t *zn; ++ ++ if (zc->zc_zap == NULL) { ++ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, ++ RW_READER, TRUE, FALSE, &zc->zc_zap); ++ if (err) ++ return (err); ++ } else { ++ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); ++ } ++ ++ zn = zap_name_alloc(zc->zc_zap, name, mt); ++ if (zn == NULL) { ++ rw_exit(&zc->zc_zap->zap_rwlock); ++ return (ENOTSUP); ++ } ++ ++ if (!zc->zc_zap->zap_ismicro) { ++ err = fzap_cursor_move_to_key(zc, zn); ++ } else { ++ mze = mze_find(zn); ++ if (mze 
== NULL) { ++ err = (ENOENT); ++ goto out; ++ } ++ zc->zc_hash = mze->mze_hash; ++ zc->zc_cd = mze->mze_phys.mze_cd; ++ } ++ ++out: ++ zap_name_free(zn); ++ rw_exit(&zc->zc_zap->zap_rwlock); ++ return (err); ++} ++ + int + zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) + { diff --git a/scripts/Makefile.am b/scripts/Makefile.am new file mode 100644 index 0000000000..e708530758 --- /dev/null +++ b/scripts/Makefile.am @@ -0,0 +1,8 @@ +EXTRA_DIST = check.sh create-zpool.sh load-zfs.sh unload-zfs.sh +EXTRA_DIST += profile-kpios-disk.sh profile-kpios-pids.sh +EXTRA_DIST += profile-kpios-post.sh profile-kpios-pre.sh profile-kpios.sh +EXTRA_DIST += survey.sh update-zfs.sh zpios-jbod.sh zpios.sh + +check: + ./check.sh + diff --git a/scripts/check.sh b/scripts/check.sh new file mode 100755 index 0000000000..bf93ee564e --- /dev/null +++ b/scripts/check.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +prog=check.sh + +die() { + echo "${prog}: $1" >&2 + exit 1 +} + +if [ $(id -u) != 0 ]; then + die "Must run as root" +fi + +./load-zfs.sh || die "" +./unload-zfs.sh || die "" + +exit 0 diff --git a/scripts/create-zpool.sh b/scripts/create-zpool.sh new file mode 100755 index 0000000000..6be37e7881 --- /dev/null +++ b/scripts/create-zpool.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +prog=create-zpool.sh +. ../.script-config + +# Single disk ilc dev nodes +DEVICES="/dev/sda" + +# All disks in a Thumper config +#DEVICES="/dev/sda /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf \ +# /dev/sdg /dev/sdh /dev/sdi /dev/sdj /dev/sdk /dev/sdl \ +# /dev/sdm /dev/sdn /dev/sdo /dev/sdp /dev/sdq /dev/sdr \ +# /dev/sds /dev/sdt /dev/sdu /dev/sdv /dev/sdw /dev/sdx \ +# /dev/sdy /dev/sdz /dev/sdaa /dev/sdab /dev/sdac /dev/sdad \ +# /dev/sdae /dev/sdaf /dev/sdag /dev/sdah /dev/sdai /dev/sdaj \ +# /dev/sdak /dev/sdal /dev/sdam /dev/sdan /dev/sdao /dev/sdap \ +# /dev/sdaq /dev/sdar /dev/sdas /dev/sdat /dev/sdau /dev/sdav" + +# Sun style disk in Thumper config +#DEVICES="/dev/sda /dev/sdb /dev/sdc \ +# /dev/sdi /dev/sdj /dev/sdk \ +# /dev/sdr /dev/sds /dev/sdt \ +# /dev/sdz /dev/sdaa /dev/sdab" + +# Promise JBOD config (ilc23) +#DEVICES="/dev/sdb /dev/sdc /dev/sdd \ +# /dev/sde /dev/sdf /dev/sdg \ +# /dev/sdh /dev/sdi /dev/sdj \ +# /dev/sdk /dev/sdl /dev/sdm" + +echo +echo "zpool create lustre " +${CMDDIR}/zpool/zpool create -F lustre ${DEVICES} + +echo +echo "zpool list" +${CMDDIR}/zpool/zpool list + +echo +echo "zpool status lustre" +${CMDDIR}/zpool/zpool status lustre + diff --git a/scripts/load-zfs.sh b/scripts/load-zfs.sh new file mode 100755 index 0000000000..6ba111b1d0 --- /dev/null +++ b/scripts/load-zfs.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +prog=load-zfs.sh +. ../.script-config + +spl_options=$1 +zpool_options=$2 + +spl_module=${SPLBUILD}/modules/spl/spl.ko +zlib_module=/lib/modules/${KERNELSRCVER}/kernel/lib/zlib_deflate/zlib_deflate.ko +zavl_module=${ZFSBUILD}/lib/libavl/zavl.ko +znvpair_module=${ZFSBUILD}/lib/libnvpair/znvpair.ko +zport_module=${ZFSBUILD}/lib/libport/zport.ko +zcommon_module=${ZFSBUILD}/lib/libzcommon/zcommon.ko +zpool_module=${ZFSBUILD}/lib/libzpool/zpool.ko +zctl_module=${ZFSBUILD}/lib/libdmu-ctl/zctl.ko +zpios_module=${ZFSBUILD}/lib/libzpios/zpios.ko + +die() { + echo "${prog}: $1" >&2 + exit 1 +} + +load_module() { + echo "Loading $1" + /sbin/insmod $* || die "Failed to load $1" +} + +if [ $(id -u) != 0 ]; then + die "Must run as root" +fi + +if /sbin/lsmod | egrep -q "^spl|^zavl|^znvpair|^zport|^zcommon|^zlib_deflate|^zpool"; then + die "Must start with modules unloaded" +fi + +if [ ! 
-f ${zavl_module} ] ||
+   [ ! -f ${znvpair_module} ] ||
+   [ ! -f ${zport_module} ] ||
+   [ ! -f ${zcommon_module} ] ||
+   [ ! -f ${zpool_module} ]; then
+	die "Source tree must be built, run 'make'"
+fi
+
+load_module ${spl_module} ${spl_options}
+load_module ${zlib_module}
+load_module ${zavl_module}
+load_module ${znvpair_module}
+load_module ${zport_module}
+load_module ${zcommon_module}
+load_module ${zpool_module} ${zpool_options}
+load_module ${zctl_module}
+load_module ${zpios_module}
+
+sleep 1
+echo "Successfully loaded ZFS module stack"
+
+exit 0
diff --git a/scripts/profile-kpios-disk.sh b/scripts/profile-kpios-disk.sh
new file mode 100755
index 0000000000..5d691b8a32
--- /dev/null
+++ b/scripts/profile-kpios-disk.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# profile-kpios-disk.sh
+#
+# /proc/diskstats
+# Field 1 -- device name
+# Field 2 -- # of reads issued
+# Field 3 -- # of reads merged
+# Field 4 -- # of sectors read
+# Field 5 -- # of milliseconds spent reading
+# Field 6 -- # of writes completed
+# Field 7 -- # of writes merged
+# Field 8 -- # of sectors written
+# Field 9 -- # of milliseconds spent writing
+# Field 10 -- # of I/Os currently in progress
+# Field 11 -- # of milliseconds spent doing I/Os
+# Field 12 -- weighted # of milliseconds spent doing I/Os
+
+RUN_PIDS=${0}
+RUN_LOG_DIR=${1}
+RUN_ID=${2}
+
+create_table() {
+	local FIELD=$1
+	local ROW_M=()
+	local ROW_N=()
+	local HEADER=1
+	local STEP=1
+
+	for DISK_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/disk-[0-9]*`; do
+		ROW_M=( ${ROW_N[@]} )
+		ROW_N=( `cat ${DISK_FILE} | grep sd | cut -c11- | cut -f${FIELD} -d' ' | tr "\n" "\t"` )
+
+		if [ $HEADER -eq 1 ]; then
+			echo -n "step, "
+			cat ${DISK_FILE} | grep sd | cut -c11- | cut -f1 -d' ' | tr "\n" ", "
+			echo "total"
+			HEADER=0
+		fi
+
+		if [ ${#ROW_M[@]} -eq 0 ]; then
+			continue
+		fi
+
+		if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then
+			echo "Badly formatted profile data in ${DISK_FILE}"
+			break
+		fi
+
+		TOTAL=0
+		echo -n "${STEP}, "
+		for (( i=0; i<${#ROW_N[@]}; i++ )); do
+			DELTA=`echo "${ROW_N[${i}]}-${ROW_M[${i}]}" | bc`
+			let TOTAL=${TOTAL}+${DELTA}
+			echo -n "${DELTA}, "
+		done
+		echo "${TOTAL}, "
+
+		let STEP=${STEP}+1
+	done
+}
+
+create_table_mbs() {
+	local FIELD=$1
+	local TIME=$2
+	local ROW_M=()
+	local ROW_N=()
+	local HEADER=1
+	local STEP=1
+
+	for DISK_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/disk-[0-9]*`; do
+		ROW_M=( ${ROW_N[@]} )
+		ROW_N=( `cat ${DISK_FILE} | grep sd | cut -c11- | cut -f${FIELD} -d' ' | tr "\n" "\t"` )
+
+		if [ $HEADER -eq 1 ]; then
+			echo -n "step, "
+			cat ${DISK_FILE} | grep sd | cut -c11- | cut -f1 -d' ' | tr "\n" ", "
+			echo "total"
+			HEADER=0
+		fi
+
+		if [ ${#ROW_M[@]} -eq 0 ]; then
+			continue
+		fi
+
+		if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then
+			echo "Badly formatted profile data in ${DISK_FILE}"
+			break
+		fi
+
+		TOTAL=0
+		echo -n "${STEP}, "
+		for (( i=0; i<${#ROW_N[@]}; i++ )); do
+			DELTA=`echo "${ROW_N[${i}]}-${ROW_M[${i}]}" | bc`
+			MBS=`echo "scale=2; ((${DELTA}*512)/${TIME})/(1024*1024)" | bc`
+			TOTAL=`echo "scale=2; ${TOTAL}+${MBS}" | bc`
+			echo -n "${MBS}, "
+		done
+		echo "${TOTAL}, "
+
+		let STEP=${STEP}+1
+	done
+}
+
+echo
+echo "Reads issued per device"
+create_table 2
+echo
+echo "Reads merged per device"
+create_table 3
+echo
+echo "Sectors read per device"
+create_table 4
+echo "MB/s per device"
+create_table_mbs 4 3
+
+echo
+echo "Writes issued per device"
+create_table 6
+echo
+echo "Writes merged per device"
+create_table 7
+echo
+echo "Sectors written per device"
+create_table 8
+echo "MB/s per device"
+create_table_mbs 8 3
+
+exit 0
diff --git a/scripts/profile-kpios-pids.sh b/scripts/profile-kpios-pids.sh
new file mode 100755
index 0000000000..d3fa10c913
--- /dev/null
+++ b/scripts/profile-kpios-pids.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+# profile-kpios-pids.sh
+
+RUN_PIDS=${0}
+RUN_LOG_DIR=${1}
+RUN_ID=${2}
+
+ROW_M=()
+ROW_N=()
+ROW_N_SCHED=()
+ROW_N_WAIT=()
+
+HEADER=1
+STEP=1
+
+for PID_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/pids-[0-9]*`; do
+	ROW_M=( ${ROW_N[@]} )
+	ROW_N=( 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 )
+	ROW_N_SCHED=( `cat ${PID_FILE} | cut -f15 -d' ' | tr "\n" "\t"` )
+	ROW_N_WAIT=( `cat ${PID_FILE} | cut -f17 -d' ' | tr "\n" "\t"` )
+	ROW_N_NAMES=( `cat ${PID_FILE} | cut -f2 -d' ' | cut -f2 -d'(' |
+	    cut -f1 -d')' | cut -f1 -d'/' | tr "\n" "\t"` )
+
+	for (( i=0; i<${#ROW_N_SCHED[@]}; i++ )); do
+		SUM=`echo "${ROW_N_WAIT[${i}]}+${ROW_N_SCHED[${i}]}" | bc`
+
+		case ${ROW_N_NAMES[${i}]} in
+			zio_taskq)	IDX=0;;
+			zio_req_nul)	IDX=1;;
+			zio_irq_nul)	IDX=2;;
+			zio_req_rd)	IDX=3;;
+			zio_irq_rd)	IDX=4;;
+			zio_req_wr)	IDX=5;;
+			zio_irq_wr)	IDX=6;;
+			zio_req_fr)	IDX=7;;
+			zio_irq_fr)	IDX=8;;
+			zio_req_cm)	IDX=9;;
+			zio_irq_cm)	IDX=10;;
+			zio_req_ctl)	IDX=11;;
+			zio_irq_ctl)	IDX=12;;
+			txg_quiesce)	IDX=13;;
+			txg_sync)	IDX=14;;
+			txg_timelimit)	IDX=15;;
+			arc_reclaim)	IDX=16;;
+			l2arc_feed)	IDX=17;;
+			kpios_io)	IDX=18;;
+			*)		continue;;
+		esac
+
+		let ROW_N[${IDX}]=${ROW_N[${IDX}]}+${SUM}
+	done
+
+	if [ $HEADER -eq 1 ]; then
+		echo "step, zio_taskq, zio_req_nul, zio_irq_nul, " \
+		     "zio_req_rd, zio_irq_rd, zio_req_wr, zio_irq_wr, " \
+		     "zio_req_fr, zio_irq_fr, zio_req_cm, zio_irq_cm, " \
+		     "zio_req_ctl, zio_irq_ctl, txg_quiesce, txg_sync, " \
+		     "txg_timelimit, arc_reclaim, l2arc_feed, kpios_io, " \
+		     "idle"
+		HEADER=0
+	fi
+
+	if [ ${#ROW_M[@]} -eq 0 ]; then
+		continue
+	fi
+
+	if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then
+		echo "Badly formatted profile data in ${PID_FILE}"
+		break
+	fi
+
+	# Original values are in jiffies and we expect HZ to be 1000
+	# on most 2.6 systems thus we divide by 10 to get a percentage.
+	IDLE=1000
+	echo -n "${STEP}, "
+	for (( i=0; i<${#ROW_N[@]}; i++ )); do
+		DELTA=`echo "${ROW_N[${i}]}-${ROW_M[${i}]}" | bc`
+		DELTA_PERCENT=`echo "scale=1; ${DELTA}/10" | bc`
+		let IDLE=${IDLE}-${DELTA}
+		echo -n "${DELTA_PERCENT}, "
+	done
+	IDLE_PERCENT=`echo "scale=1; ${IDLE}/10" | bc`
+	echo "${IDLE_PERCENT}"
+
+	let STEP=${STEP}+1
+done
+
+exit
+
+echo
+echo "Percent of total system time per pid"
+for PID_FILE in `ls -r --sort=time --time=ctime ${RUN_LOG_DIR}/${RUN_ID}/pids-[0-9]*`; do
+	ROW_M=( ${ROW_N[@]} )
+	ROW_N_SCHED=( `cat ${PID_FILE} | cut -f15 -d' ' | tr "\n" "\t"` )
+	ROW_N_WAIT=( `cat ${PID_FILE} | cut -f17 -d' ' | tr "\n" "\t"` )
+
+	for (( i=0; i<${#ROW_N_SCHED[@]}; i++ )); do
+		ROW_N[${i}]=`echo "${ROW_N_WAIT[${i}]}+${ROW_N_SCHED[${i}]}" | bc`
+	done
+
+	if [ $HEADER -eq 1 ]; then
+		echo -n "step, "
+		cat ${PID_FILE} | cut -f2 -d' ' | tr "\n" ", "
+		echo
+		HEADER=0
+	fi
+
+	if [ ${#ROW_M[@]} -eq 0 ]; then
+		continue
+	fi
+
+	if [ ${#ROW_M[@]} -ne ${#ROW_N[@]} ]; then
+		echo "Badly formatted profile data in ${PID_FILE}"
+		break
+	fi
+
+	# Original values are in jiffies and we expect HZ to be 1000
+	# on most 2.6 systems thus we divide by 10 to get a percentage.
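+	# (Strictly, DELTA/10 is a percentage of one CPU only when samples
+	# are one second apart; profile-kpios.sh polls every 2.99 seconds,
+	# so treat these columns as relative figures.)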
+ echo -n "${STEP}, " + for (( i=0; i<${#ROW_N[@]}; i++ )); do + DELTA=`echo "scale=1; (${ROW_N[${i}]}-${ROW_M[${i}]})/10" | bc` + echo -n "${DELTA}, " + done + + echo + let STEP=${STEP}+1 +done + + +exit 0 diff --git a/scripts/profile-kpios-post.sh b/scripts/profile-kpios-post.sh new file mode 100755 index 0000000000..74c0890a8a --- /dev/null +++ b/scripts/profile-kpios-post.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +prog=profile-kpios-post.sh +. ../.script-config + +RUN_POST=${0} +RUN_PHASE=${1} +RUN_LOG_DIR=${2} +RUN_ID=${3} +RUN_POOL=${4} +RUN_CHUNK_SIZE=${5} +RUN_REGION_SIZE=${6} +RUN_THREAD_COUNT=${7} +RUN_REGION_COUNT=${8} +RUN_OFFSET=${9} +RUN_REGION_NOISE=${10} +RUN_CHUNK_NOISE=${11} +RUN_THREAD_DELAY=${12} +RUN_FLAGS=${13} +RUN_RESULT=${14} + +PROFILE_KPIOS_PIDS_BIN=/home/behlendo/src/zfs/scripts/profile-kpios-pids.sh +PROFILE_KPIOS_PIDS_LOG=${RUN_LOG_DIR}/${RUN_ID}/pids-summary.csv + +PROFILE_KPIOS_DISK_BIN=/home/behlendo/src/zfs/scripts/profile-kpios-disk.sh +PROFILE_KPIOS_DISK_LOG=${RUN_LOG_DIR}/${RUN_ID}/disk-summary.csv + +PROFILE_KPIOS_ARC_LOG=${RUN_LOG_DIR}/${RUN_ID}/arcstats +PROFILE_KPIOS_VDEV_LOG=${RUN_LOG_DIR}/${RUN_ID}/vdev_cache_stats + +KERNEL_BIN="/lib/modules/`uname -r`/kernel/" +SPL_BIN="${SPLBUILD}/modules/spl/" +ZFS_BIN="${ZFSBUILD}/lib/" + +OPROFILE_SHORT_ARGS="-a -g -l -p ${KERNEL_BIN},${SPL_BIN},${ZFS_BIN}" +OPROFILE_LONG_ARGS="-d -a -g -l -p ${KERNEL_BIN},${SPL_BIN},${ZFS_BIN}" + +OPROFILE_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile.txt +OPROFILE_SHORT_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile-short.txt +OPROFILE_LONG_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile-long.txt +PROFILE_PID=${RUN_LOG_DIR}/${RUN_ID}/pid + +if [ "${RUN_PHASE}" != "post" ]; then + exit 1 +fi + +# opcontrol --stop >>${OPROFILE_LOG} 2>&1 +# opcontrol --dump >>${OPROFILE_LOG} 2>&1 + +kill -s SIGHUP `cat ${PROFILE_PID}` +rm -f ${PROFILE_PID} + +# opreport ${OPROFILE_SHORT_ARGS} >${OPROFILE_SHORT_LOG} 2>&1 +# opreport ${OPROFILE_LONG_ARGS} >${OPROFILE_LONG_LOG} 2>&1 + +# opcontrol --deinit >>${OPROFILE_LOG} 2>&1 + +cat /proc/spl/kstat/zfs/arcstats >${PROFILE_KPIOS_ARC_LOG} +cat /proc/spl/kstat/zfs/vdev_cache_stats >${PROFILE_KPIOS_VDEV_LOG} + +# Summarize system time per pid +${PROFILE_KPIOS_PIDS_BIN} ${RUN_LOG_DIR} ${RUN_ID} >${PROFILE_KPIOS_PIDS_LOG} + +# Summarize per device performance +${PROFILE_KPIOS_DISK_BIN} ${RUN_LOG_DIR} ${RUN_ID} >${PROFILE_KPIOS_DISK_LOG} + +exit 0 diff --git a/scripts/profile-kpios-pre.sh b/scripts/profile-kpios-pre.sh new file mode 100755 index 0000000000..1a2dcf0221 --- /dev/null +++ b/scripts/profile-kpios-pre.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# profile-kpios-pre.sh + +trap "PROFILE_KPIOS_READY=1" SIGHUP + +RUN_PRE=${0} +RUN_PHASE=${1} +RUN_LOG_DIR=${2} +RUN_ID=${3} +RUN_POOL=${4} +RUN_CHUNK_SIZE=${5} +RUN_REGION_SIZE=${6} +RUN_THREAD_COUNT=${7} +RUN_REGION_COUNT=${8} +RUN_OFFSET=${9} +RUN_REGION_NOISE=${10} +RUN_CHUNK_NOISE=${11} +RUN_THREAD_DELAY=${12} +RUN_FLAGS=${13} +RUN_RESULT=${14} + +PROFILE_KPIOS_BIN=/home/behlendo/src/zfs/scripts/profile-kpios.sh +PROFILE_KPIOS_READY=0 + +OPROFILE_LOG=${RUN_LOG_DIR}/${RUN_ID}/oprofile.txt +PROFILE_PID=${RUN_LOG_DIR}/${RUN_ID}/pid +RUN_ARGS=${RUN_LOG_DIR}/${RUN_ID}/args + +if [ "${RUN_PHASE}" != "pre" ]; then + exit 1 +fi + +rm -Rf ${RUN_LOG_DIR}/${RUN_ID}/ +mkdir -p ${RUN_LOG_DIR}/${RUN_ID}/ + +echo "PHASE=${RUN_PHASE}" >>${RUN_ARGS} +echo "LOG_DIR=${RUN_LOG_DIR}" >>${RUN_ARGS} +echo "ID=${RUN_ID}" >>${RUN_ARGS} +echo "POOL=${RUN_POOL}" >>${RUN_ARGS} +echo "CHUNK_SIZE=${RUN_CHUNK_SIZE}" >>${RUN_ARGS} +echo 
"REGION_SIZE=${RUN_REGION_SIZE}" >>${RUN_ARGS} +echo "THREAD_COUNT=${RUN_THREAD_COUNT}" >>${RUN_ARGS} +echo "REGION_COUNT=${RUN_REGION_COUNT}" >>${RUN_ARGS} +echo "OFFSET=${RUN_OFFSET}" >>${RUN_ARGS} +echo "REGION_NOISE=${RUN_REGION_NOISE}" >>${RUN_ARGS} +echo "CHUNK_NOISE=${RUN_CHUNK_NOISE}" >>${RUN_ARGS} +echo "THREAD_DELAY=${RUN_THREAD_DELAY}" >>${RUN_ARGS} +echo "FLAGS=${RUN_FLAGS}" >>${RUN_ARGS} +echo "RESULT=${RUN_RESULT}" >>${RUN_ARGS} + +# XXX: Oprofile support seems to be broken when I try and start +# it via a user mode helper script, I suspect the setup is failing. +# opcontrol --init >>${OPROFILE_LOG} 2>&1 +# opcontrol --setup --vmlinux=/boot/vmlinux >>${OPROFILE_LOG} 2>&1 + +# Start the profile script +${PROFILE_KPIOS_BIN} ${RUN_PHASE} ${RUN_LOG_DIR} ${RUN_ID} & +echo "$!" >${PROFILE_PID} + +# Sleep waiting for profile script to be ready, it will +# signal us via SIGHUP when it is ready to start profiling. +while [ ${PROFILE_KPIOS_READY} -eq 0 ]; do + sleep 0.1 +done + +# opcontrol --start-daemon >>${OPROFILE_LOG} 2>&1 +# opcontrol --start >>${OPROFILE_LOG} 2>&1 + +exit 0 diff --git a/scripts/profile-kpios.sh b/scripts/profile-kpios.sh new file mode 100755 index 0000000000..88b9343f3e --- /dev/null +++ b/scripts/profile-kpios.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# profile-kpios.sh + +trap "RUN_DONE=1" SIGHUP + +RUN_PHASE=${1} +RUN_LOG_DIR=${2} +RUN_ID=${3} +RUN_DONE=0 + +POLL_INTERVAL=2.99 + +# Log these pids, the exact pid numbers will vary from system to system +# so I harvest pid for all the following type of processes from /proc// +# +# zio_taskq/# +# spa_zio_issue/# +# spa_zio_intr/# +# txg_quiesce_thr +# txg_sync_thread +# txg_timelimit_t +# arc_reclaim_thr +# l2arc_feed_thre +# kpios_io/# + +ZIO_TASKQ_PIDS=() +ZIO_REQ_NUL_PIDS=() +ZIO_IRQ_NUL_PIDS=() +ZIO_REQ_RD_PIDS=() +ZIO_IRQ_RD_PIDS=() +ZIO_REQ_WR_PIDS=() +ZIO_IRQ_WR_PIDS=() +ZIO_REQ_FR_PIDS=() +ZIO_IRQ_FR_PIDS=() +ZIO_REQ_CM_PIDS=() +ZIO_IRQ_CM_PIDS=() +ZIO_REQ_CTL_PIDS=() +ZIO_IRQ_CTL_PIDS=() + +TXG_QUIESCE_PIDS=() +TXG_SYNC_PIDS=() +TXG_TIMELIMIT_PIDS=() + +ARC_RECLAIM_PIDS=() +L2ARC_FEED_PIDS=() + +KPIOS_IO_PIDS=() + +show_pids() { + echo "* zio_taskq: { ${ZIO_TASKQ_PIDS[@]} } = ${#ZIO_TASKQ_PIDS[@]}" + echo "* zio_req_nul: { ${ZIO_REQ_NUL_PIDS[@]} } = ${#ZIO_REQ_NUL_PIDS[@]}" + echo "* zio_irq_nul: { ${ZIO_IRQ_NUL_PIDS[@]} } = ${#ZIO_IRQ_NUL_PIDS[@]}" + echo "* zio_req_rd: { ${ZIO_REQ_RD_PIDS[@]} } = ${#ZIO_REQ_RD_PIDS[@]}" + echo "* zio_irq_rd: { ${ZIO_IRQ_RD_PIDS[@]} } = ${#ZIO_IRQ_RD_PIDS[@]}" + echo "* zio_req_wr: { ${ZIO_REQ_WR_PIDS[@]} } = ${#ZIO_REQ_WR_PIDS[@]}" + echo "* zio_irq_wr: { ${ZIO_IRQ_WR_PIDS[@]} } = ${#ZIO_IRQ_WR_PIDS[@]}" + echo "* zio_req_fr: { ${ZIO_REQ_FR_PIDS[@]} } = ${#ZIO_REQ_FR_PIDS[@]}" + echo "* zio_irq_fr: { ${ZIO_IRQ_FR_PIDS[@]} } = ${#ZIO_IRQ_FR_PIDS[@]}" + echo "* zio_req_cm: { ${ZIO_REQ_CM_PIDS[@]} } = ${#ZIO_REQ_CM_PIDS[@]}" + echo "* zio_irq_cm: { ${ZIO_IRQ_CM_PIDS[@]} } = ${#ZIO_IRQ_CM_PIDS[@]}" + echo "* zio_req_ctl: { ${ZIO_REQ_CTL_PIDS[@]} } = ${#ZIO_REQ_CTL_PIDS[@]}" + echo "* zio_irq_ctl: { ${ZIO_IRQ_CTL_PIDS[@]} } = ${#ZIO_IRQ_CTL_PIDS[@]}" + echo "* txg_quiesce: { ${TXG_QUIESCE_PIDS[@]} } = ${#TXG_QUIESCE_PIDS[@]}" + echo "* txg_sync: { ${TXG_SYNC_PIDS[@]} } = ${#TXG_SYNC_PIDS[@]}" + echo "* txg_timelimit: { ${TXG_TIMELIMIT_PIDS[@]} } = ${#TXG_TIMELIMIT_PIDS[@]}" + echo "* arc_reclaim: { ${ARC_RECLAIM_PIDS[@]} } = ${#ARC_RECLAIM_PIDS[@]}" + echo "* l2arc_feed: { ${L2ARC_FEED_PIDS[@]} } = ${#L2ARC_FEED_PIDS[@]}" + echo "* kpios_io: { ${KPIOS_IO_PIDS[@]} } = 
${#KPIOS_IO_PIDS[@]}" +} + +check_pid() { + local PID=$1 + local NAME=$2 + local TYPE=$3 + local PIDS=( "$4" ) + local NAME_STRING=`echo ${NAME} | cut -f1 -d'/'` + local NAME_NUMBER=`echo ${NAME} | cut -f2 -d'/'` + + if [ "${NAME_STRING}" == "${TYPE}" ]; then + if [ -n "${NAME_NUMBER}" ]; then + PIDS[${NAME_NUMBER}]=${PID} + else + PIDS[${#PIDS[@]}]=${PID} + + fi + fi + + echo "${PIDS[@]}" +} + +# NOTE: This whole process is crazy slow but it will do for now +aquire_pids() { + echo "--- Aquiring ZFS pids ---" + + for PID in `ls /proc/ | grep [0-9] | sort -n -u`; do + if [ ! -e /proc/${PID}/status ]; then + continue + fi + + NAME=`cat /proc/${PID}/status | head -n1 | cut -f2` + + ZIO_TASKQ_PIDS=( `check_pid ${PID} ${NAME} "zio_taskq" \ + "$(echo "${ZIO_TASKQ_PIDS[@]}")"` ) + + ZIO_REQ_NUL_PIDS=( `check_pid ${PID} ${NAME} "zio_req_nul" \ + "$(echo "${ZIO_REQ_NUL_PIDS[@]}")"` ) + + ZIO_IRQ_NUL_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_nul" \ + "$(echo "${ZIO_IRQ_NUL_PIDS[@]}")"` ) + + ZIO_REQ_RD_PIDS=( `check_pid ${PID} ${NAME} "zio_req_rd" \ + "$(echo "${ZIO_REQ_RD_PIDS[@]}")"` ) + + ZIO_IRQ_RD_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_rd" \ + "$(echo "${ZIO_IRQ_RD_PIDS[@]}")"` ) + + ZIO_REQ_WR_PIDS=( `check_pid ${PID} ${NAME} "zio_req_wr" \ + "$(echo "${ZIO_REQ_WR_PIDS[@]}")"` ) + + ZIO_IRQ_WR_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_wr" \ + "$(echo "${ZIO_IRQ_WR_PIDS[@]}")"` ) + + ZIO_REQ_FR_PIDS=( `check_pid ${PID} ${NAME} "zio_req_fr" \ + "$(echo "${ZIO_REQ_FR_PIDS[@]}")"` ) + + ZIO_IRQ_FR_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_fr" \ + "$(echo "${ZIO_IRQ_FR_PIDS[@]}")"` ) + + ZIO_REQ_CM_PIDS=( `check_pid ${PID} ${NAME} "zio_req_cm" \ + "$(echo "${ZIO_REQ_CM_PIDS[@]}")"` ) + + ZIO_IRQ_CM_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_cm" \ + "$(echo "${ZIO_IRQ_CM_PIDS[@]}")"` ) + + ZIO_REQ_CTL_PIDS=( `check_pid ${PID} ${NAME} "zio_req_ctl" \ + "$(echo "${ZIO_REQ_CTL_PIDS[@]}")"` ) + + ZIO_IRQ_CTL_PIDS=( `check_pid ${PID} ${NAME} "zio_irq_ctl" \ + "$(echo "${ZIO_IRQ_CTL_PIDS[@]}")"` ) + + TXG_QUIESCE_PIDS=( `check_pid ${PID} ${NAME} "txg_quiesce" \ + "$(echo "${TXG_QUIESCE_PIDS[@]}")"` ) + + TXG_SYNC_PIDS=( `check_pid ${PID} ${NAME} "txg_sync" \ + "$(echo "${TXG_SYNC_PIDS[@]}")"` ) + + TXG_TIMELIMIT_PIDS=( `check_pid ${PID} ${NAME} "txg_timelimit" \ + "$(echo "${TXG_TIMELIMIT_PIDS[@]}")"` ) + + ARC_RECLAIM_PIDS=( `check_pid ${PID} ${NAME} "arc_reclaim" \ + "$(echo "${ARC_RECLAIM_PIDS[@]}")"` ) + + L2ARC_FEED_PIDS=( `check_pid ${PID} ${NAME} "l2arc_feed" \ + "$(echo "${L2ARC_FEED_PIDS[@]}")"` ) + done + + # Wait for kpios_io threads to start + kill -s SIGHUP ${PPID} + echo "* Waiting for kpios_io threads to start" + while [ ${RUN_DONE} -eq 0 ]; do + KPIOS_IO_PIDS=( `ps ax | grep kpios_io | grep -v grep | \ + sed 's/^ *//g' | cut -f1 -d' '` ) + if [ ${#KPIOS_IO_PIDS[@]} -gt 0 ]; then + break; + fi + sleep 0.1 + done + + echo "`show_pids`" >${RUN_LOG_DIR}/${RUN_ID}/pids.txt +} + +log_pids() { + echo "--- Logging ZFS profile to ${RUN_LOG_DIR}/${RUN_ID}/ ---" + ALL_PIDS=( ${ZIO_TASKQ_PIDS[@]} \ + ${ZIO_REQ_NUL_PIDS[@]} \ + ${ZIO_IRQ_NUL_PIDS[@]} \ + ${ZIO_REQ_RD_PID[@]} \ + ${ZIO_IRQ_RD_PIDS[@]} \ + ${ZIO_REQ_WR_PIDS[@]} \ + ${ZIO_IRQ_WR_PIDS[@]} \ + ${ZIO_REQ_FR_PIDS[@]} \ + ${ZIO_IRQ_FR_PIDS[@]} \ + ${ZIO_REQ_CM_PIDS[@]} \ + ${ZIO_IRQ_CM_PIDS[@]} \ + ${ZIO_REQ_CTL_PIDS[@]} \ + ${ZIO_IRQ_CTL_PIDS[@]} \ + ${TXG_QUIESCE_PIDS[@]} \ + ${TXG_SYNC_PIDS[@]} \ + ${TXG_TIMELIMIT_PIDS[@]} \ + ${ARC_RECLAIM_PIDS[@]} \ + ${L2ARC_FEED_PIDS[@]} \ + ${KPIOS_IO_PIDS[@]} ) + + while [ ${RUN_DONE} -eq 0 ]; do + 
+		NOW=`date +%s.%N`
+		LOG_PIDS="${RUN_LOG_DIR}/${RUN_ID}/pids-${NOW}"
+		LOG_DISK="${RUN_LOG_DIR}/${RUN_ID}/disk-${NOW}"
+
+		for PID in "${ALL_PIDS[@]}"; do
+			if [ -z ${PID} ]; then
+				continue;
+			fi
+
+			if [ -e /proc/${PID}/stat ]; then
+				cat /proc/${PID}/stat | head -n1 >>${LOG_PIDS}
+			else
+				echo "<${PID} exited>" >>${LOG_PIDS}
+			fi
+		done
+
+		cat /proc/diskstats >${LOG_DISK}
+
+		NOW2=`date +%s.%N`
+		DELTA=`echo "${POLL_INTERVAL}-(${NOW2}-${NOW})" | bc`
+		sleep ${DELTA}
+	done
+}
+
+acquire_pids
+log_pids
+
+exit 0
diff --git a/scripts/survey.sh b/scripts/survey.sh
new file mode 100755
index 0000000000..9198a418f3
--- /dev/null
+++ b/scripts/survey.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+prog=survey.sh
+. ../.script-config
+
+LOG=/home/`whoami`/zpios-logs/`uname -r`/kpios-`date +%Y%m%d`/
+mkdir -p ${LOG}
+
+# Apply all tunings described below to generate some best case
+# numbers for what is achievable with some more elbow grease.
+NAME="prefetch+zerocopy+checksum+pending1024+kmem"
+echo "----------------------- ${NAME} ------------------------------"
+./zpios.sh \
+	"" \
+	"zfs_prefetch_disable=1 zfs_vdev_max_pending=1024 zio_bulk_flags=0x100" \
+	"--zerocopy" \
+	${LOG}/${NAME}/ \
+	"${CMDDIR}/zfs/zfs set checksum=off lustre" | \
+	tee ${LOG}/${NAME}.txt
+
+# Baseline number for an out of the box config with no manual tuning.
+# Ideally, we will want things to be automatically tuned and for this
+# number to approach the tweaked out results above.
+NAME="baseline"
+echo "----------------------- ${NAME} ------------------------------"
+./zpios.sh \
+	"" \
+	"" \
+	"" \
+	${LOG}/${NAME}/ | \
+	tee ${LOG}/${NAME}.txt
+
+# Disable ZFS's prefetching. For some reason, still not clear to me,
+# the current prefetching policy is quite bad for a random workload.
+# Allowing the algorithm to detect a random workload and do nothing
+# may be the way to address this issue.
+NAME="prefetch"
+echo "----------------------- ${NAME} ------------------------------"
+./zpios.sh \
+	"" \
+	"zfs_prefetch_disable=1" \
+	"" \
+	${LOG}/${NAME}/ | \
+	tee ${LOG}/${NAME}.txt
+
+# As expected, simulating a zerocopy IO path improves performance
+# by freeing up lots of CPU which is wasted moving data between buffers.
+NAME="zerocopy"
+echo "----------------------- ${NAME} ------------------------------"
+./zpios.sh \
+	"" \
+	"" \
+	"--zerocopy" \
+	${LOG}/${NAME}/ | \
+	tee ${LOG}/${NAME}.txt
+
+# Disabling checksumming should show some (if small) improvement
+# simply due to freeing up a modest amount of CPU.
+NAME="checksum"
+echo "----------------------- ${NAME} ------------------------------"
+./zpios.sh \
+	"" \
+	"" \
+	"" \
+	${LOG}/${NAME}/ \
+	"${CMDDIR}/zfs/zfs set checksum=off lustre" | \
+	tee ${LOG}/${NAME}.txt
+
+# Increasing the pending IO depth also seems to improve things, likely
+# at the expense of latency. This should be explored more because I'm
+# seeing a much bigger impact there than I would have expected. There
+# may be some low hanging fruit to be found here.
+NAME="pending"
+echo "----------------------- ${NAME} ------------------------------"
+./zpios.sh \
+	"" \
+	"zfs_vdev_max_pending=1024" \
+	"" \
+	${LOG}/${NAME}/ | \
+	tee ${LOG}/${NAME}.txt
+
+# To avoid memory fragmentation issues our slab implementation can be
+# based on a virtual address space. Interestingly, we take a pretty
+# substantial performance penalty for this somewhere in the low level
+# IO drivers. If we back the slab with kmem pages we see far better
+# read performance numbers at the cost of memory fragmentation and general
+# system instability due to large allocations. This may be because of
+# an optimization in the low level drivers due to the contiguous kmem
+# based memory. This needs to be explained. The good news here is that
+# with zerocopy interfaces added at the DMU layer we could guarantee
+# kmem based memory for a pool of pages.
+#
+# 0x100 = KMC_KMEM - Force kmem_* based slab
+# 0x200 = KMC_VMEM - Force vmem_* based slab
+NAME="kmem"
+echo "----------------------- ${NAME} ------------------------------"
+./zpios.sh \
+	"" \
+	"zio_bulk_flags=0x100" \
+	"" \
+	${LOG}/${NAME}/ | \
+	tee ${LOG}/${NAME}.txt
diff --git a/scripts/unload-zfs.sh b/scripts/unload-zfs.sh
new file mode 100755
index 0000000000..12e987bd3f
--- /dev/null
+++ b/scripts/unload-zfs.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+prog=unload-zfs.sh
+. ../.script-config
+
+spl_module=${SPLBUILD}/modules/spl/spl.ko
+zlib_module=/lib/modules/${KERNELSRCVER}/kernel/lib/zlib_deflate/zlib_deflate.ko
+zavl_module=${ZFSBUILD}/lib/libavl/zavl.ko
+znvpair_module=${ZFSBUILD}/lib/libnvpair/znvpair.ko
+zport_module=${ZFSBUILD}/lib/libport/zport.ko
+zcommon_module=${ZFSBUILD}/lib/libzcommon/zcommon.ko
+zpool_module=${ZFSBUILD}/lib/libzpool/zpool.ko
+zctl_module=${ZFSBUILD}/lib/libdmu-ctl/zctl.ko
+zpios_module=${ZFSBUILD}/lib/libzpios/zpios.ko
+
+die() {
+	echo "${prog}: $1" >&2
+	exit 1
+}
+
+unload_module() {
+	echo "Unloading $1"
+	/sbin/rmmod $1 || die "Failed to unload $1"
+}
+
+if [ $(id -u) != 0 ]; then
+	die "Must run as root"
+fi
+
+unload_module ${zpios_module}
+unload_module ${zctl_module}
+unload_module ${zpool_module}
+unload_module ${zcommon_module}
+unload_module ${zport_module}
+unload_module ${znvpair_module}
+unload_module ${zavl_module}
+unload_module ${zlib_module}
+
+# Set DUMP=1 to generate debug logs on unload
+if [ -n "${DUMP}" ]; then
+	sysctl -w kernel.spl.debug.dump=1
+	# This is racy, I don't like it, but for a helper script it will do.
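+	# (Racy because the dump file name is scraped from the last dmesg
+	# line, so an unrelated kernel message logged in between would be
+	# picked up instead of the SPL log path.)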
+ SPL_LOG=`dmesg | tail -n 1 | cut -f5 -d' '` + ${SPLBUILD}/cmd/spl ${SPL_LOG} >${SPL_LOG}.log + echo + echo "Dumped debug log: ${SPL_LOG}.log" + tail -n1 ${SPL_LOG}.log + echo +fi + +unload_module ${spl_module} + +echo "Successfully unloaded ZFS module stack" + +exit 0 diff --git a/scripts/update-zfs.sh b/scripts/update-zfs.sh new file mode 100755 index 0000000000..2bcc19b18c --- /dev/null +++ b/scripts/update-zfs.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +PROG=update-zfs.sh +ZFS_SRC=http://dlc.sun.com/osol/on/downloads/b89/on-src.tar.bz2 + +die() { + rm -Rf $SRC + echo "${PROG}: $1" >&2 + exit 1 +} + +DEST=`pwd` +if [ `basename $DEST` != "scripts" ]; then + die "Must be run from scripts directory" +fi + +SRC=`mktemp -d /tmp/zfs.XXXXXXXXXX` +DEST=`dirname $DEST` +DATE=`date +%Y%m%d%H%M%S` + +wget $ZFS_SRC + +echo "--- Updating ZFS source ---" +echo +echo "ZFS_REPO = $ZFS_REPO" +echo "ZFS_PATCH_REPO = $ZFS_PATCH_REPO" +echo "SRC = $SRC" +echo "DEST = $DEST" + +echo +echo "--- Cloning $ZFS_REPO ---" +cd $SRC || die "Failed to 'cd $SRC'" +hg clone $ZFS_REPO || die "Failed to clone $ZFS_REPO" + +echo +echo "--- Cloning $ZFS_PATCH_REPO ---" +hg clone $ZFS_PATCH_REPO patches || die "Failed to clone $ZFS_PATCH_REPO" + +echo +echo "--- Backing up existing files ---" +echo "$DEST/zfs -> $DEST/zfs.$DATE" +cp -Rf $DEST/zfs $DEST/zfs.$DATE || die "Failed to backup" +echo "$DEST/zfs_patches -> $DEST/zfs_patches.$DATE" +cp -Rf $DEST/zfs_patches $DEST/zfs_patches.$DATE || die "Failed to backup" + +echo +echo "--- Overwriting $DEST/zfs and $DEST/zfs_patches ---" +find $SRC/trunk/src/ -name SConstruct -type f -print | xargs /bin/rm -f +find $SRC/trunk/src/ -name SConscript -type f -print | xargs /bin/rm -f +find $SRC/trunk/src/ -name *.orig -type f -print | xargs /bin/rm -f +rm -f $SRC/trunk/src/myconfig.py +cp -Rf $SRC/trunk/src/* $DEST/zfs || die "Failed to overwrite" +cp -Rf $SRC/patches/*.patch $DEST/zfs_patches/patches/ || die "Failed to overwrite" +cp -f $SRC/patches/series $DEST/zfs_patches/series/zfs-lustre + +echo +echo "--- Removing $SRC ---" +rm -Rf $SRC + diff --git a/scripts/zpios-jbod.sh b/scripts/zpios-jbod.sh new file mode 100755 index 0000000000..4cb960fc23 --- /dev/null +++ b/scripts/zpios-jbod.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +prog=zpios-jbod.sh +. 
../.script-config + +SPL_OPTIONS=$1 +ZPOOL_OPTIONS=$2 +KPIOS_OPTIONS=$3 +PROFILE_KPIOS_LOGS=$4 +KPIOS_PRE=$5 +KPIOS_POST=$6 + +PROFILE_KPIOS_PRE=/home/behlendo/src/zfs/scripts/profile-kpios-pre.sh +PROFILE_KPIOS_POST=/home/behlendo/src/zfs/scripts/profile-kpios-post.sh + +echo ------------------------- ZFS TEST LOG --------------------------------- +echo -n "Date = "; date +echo -n "Kernel = "; uname -r +echo ------------------------------------------------------------------------ + +echo +./load-zfs.sh "${SPL_OPTIONS}" "${ZPOOL_OPTIONS}" + +sysctl -w kernel.spl.debug.mask=0 +sysctl -w kernel.spl.debug.subsystem=0 + +echo ---------------------- SPL Sysctl Tunings ------------------------------ +sysctl -A | grep spl +echo + +echo ------------------- SPL/ZPOOL Module Tunings --------------------------- +grep [0-9] /sys/module/spl/parameters/* +grep [0-9] /sys/module/zpool/parameters/* +echo + +DEVICES="/dev/sdn /dev/sdo /dev/sdp \ + /dev/sdq /dev/sdr /dev/sds \ + /dev/sdt /dev/sdu /dev/sdv \ + /dev/sdw /dev/sdx /dev/sdy" + +${CMDDIR}/zpool/zpool create -F lustre ${DEVICES} +${CMDDIR}/zpool/zpool status lustre + +if [ -n "${KPIOS_PRE}" ]; then + ${KPIOS_PRE} +fi + +# Usage: zpios +# --chunksize -c =values +# --chunksize_low -a =value +# --chunksize_high -b =value +# --chunksize_incr -g =value +# --offset -o =values +# --offset_low -m =value +# --offset_high -q =value +# --offset_incr -r =value +# --regioncount -n =values +# --regioncount_low -i =value +# --regioncount_high -j =value +# --regioncount_incr -k =value +# --threadcount -t =values +# --threadcount_low -l =value +# --threadcount_high -h =value +# --threadcount_incr -e =value +# --regionsize -s =values +# --regionsize_low -A =value +# --regionsize_high -B =value +# --regionsize_incr -C =value +# --cleanup -x +# --verify -V +# --zerocopy -z +# --threaddelay -T =jiffies +# --regionnoise -I =shift +# --chunknoise -N =bytes +# --prerun -P =pre-command +# --postrun -R =post-command +# --log -G =log directory +# --pool | --path -p =pool name +# --load -L =dmuio +# --help -? =this help +# --verbose -v =increase verbosity +# --threadcount=256,256,256,256,256 \ + +CMD="${CMDDIR}/zpios/zpios \ + --load=dmuio \ + --path=lustre \ + --chunksize=1M \ + --regionsize=4M \ + --regioncount=16384 \ + --threadcount=256 \ + --offset=4M \ + --cleanup \ + --verbose \ + --human-readable \ + ${KPIOS_OPTIONS} \ + --prerun=${PROFILE_KPIOS_PRE} \ + --postrun=${PROFILE_KPIOS_POST} \ + --log=${PROFILE_KPIOS_LOGS}" +echo +date +echo ${CMD} +$CMD +date + +if [ -n "${KPIOS_POST}" ]; then + ${KPIOS_POST} +fi + +${CMDDIR}/zpool/zpool destroy lustre +./unload-zfs.sh diff --git a/scripts/zpios.sh b/scripts/zpios.sh new file mode 100755 index 0000000000..a22a33c0d0 --- /dev/null +++ b/scripts/zpios.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +prog=zpios.sh +. 
../.script-config + +SPL_OPTIONS="spl_debug_mask=0 spl_debug_subsys=0 ${1}" +ZPOOL_OPTIONS=$2 +KPIOS_OPTIONS=$3 +PROFILE_KPIOS_LOGS=$4 +KPIOS_PRE=$5 +KPIOS_POST=$6 + +PROFILE_KPIOS_PRE=/home/behlendo/src/zfs/scripts/profile-kpios-pre.sh +PROFILE_KPIOS_POST=/home/behlendo/src/zfs/scripts/profile-kpios-post.sh + +echo ------------------------- ZFS TEST LOG --------------------------------- +echo -n "Date = "; date +echo -n "Kernel = "; uname -r +echo ------------------------------------------------------------------------ + +echo +./load-zfs.sh "${SPL_OPTIONS}" "${ZPOOL_OPTIONS}" + +echo ---------------------- SPL Sysctl Tunings ------------------------------ +sysctl -A | grep spl +echo + +echo ------------------- SPL/ZPOOL Module Tunings --------------------------- +if [ -d /sys/module/spl/parameters ]; then + grep [0-9] /sys/module/spl/parameters/* + grep [0-9] /sys/module/zpool/parameters/* +else + grep [0-9] /sys/module/spl/* + grep [0-9] /sys/module/zpool/* +fi +echo + +# LOCAL HACK +if [ `hostname` = "ilc23" ]; then + DEVICES="/dev/sdy /dev/sdo /dev/sdp /dev/sdq /dev/sdr /dev/sds \ + /dev/sdt /dev/sdu /dev/sdv /dev/sdw /dev/sdx" +else + DEVICES="/dev/hda" +fi + +echo "${CMDDIR}/zpool/zpool create -F lustre ${DEVICES}" +${CMDDIR}/zpool/zpool create -F lustre ${DEVICES} + +echo "${CMDDIR}/zpool/zpool status lustre" +${CMDDIR}/zpool/zpool status lustre + +echo "Waiting for /dev/kpios to come up..." +while [ ! -c /dev/kpios ]; do + sleep 1 +done + +if [ -n "${KPIOS_PRE}" ]; then + ${KPIOS_PRE} +fi + +# Usage: zpios +# --chunksize -c =values +# --chunksize_low -a =value +# --chunksize_high -b =value +# --chunksize_incr -g =value +# --offset -o =values +# --offset_low -m =value +# --offset_high -q =value +# --offset_incr -r =value +# --regioncount -n =values +# --regioncount_low -i =value +# --regioncount_high -j =value +# --regioncount_incr -k =value +# --threadcount -t =values +# --threadcount_low -l =value +# --threadcount_high -h =value +# --threadcount_incr -e =value +# --regionsize -s =values +# --regionsize_low -A =value +# --regionsize_high -B =value +# --regionsize_incr -C =value +# --cleanup -x +# --verify -V +# --zerocopy -z +# --threaddelay -T =jiffies +# --regionnoise -I =shift +# --chunknoise -N =bytes +# --prerun -P =pre-command +# --postrun -R =post-command +# --log -G =log directory +# --pool | --path -p =pool name +# --load -L =dmuio +# --help -? 
=this help +# --verbose -v =increase verbosity + +# --prerun=${PROFILE_KPIOS_PRE} \ +# --postrun=${PROFILE_KPIOS_POST} \ + +CMD="${CMDDIR}/zpios/zpios \ + --load=dmuio \ + --path=lustre \ + --chunksize=1M \ + --regionsize=4M \ + --regioncount=16384 \ + --threadcount=256,256,256,256,256 \ + --offset=4M \ + --cleanup \ + --verbose \ + --human-readable \ + ${KPIOS_OPTIONS} \ + --log=${PROFILE_KPIOS_LOGS}" +echo +date +echo ${CMD} +$CMD +date + +if [ -n "${KPIOS_POST}" ]; then + ${KPIOS_POST} +fi + +${CMDDIR}/zpool/zpool destroy lustre + +echo ---------------------- SPL Sysctl Tunings ------------------------------ +sysctl -A | grep spl +echo + +echo ------------------------ KSTAT Statistics ------------------------------ +echo ARCSTATS +cat /proc/spl/kstat/zfs/arcstats +echo +echo VDEV_CACHE_STATS +cat /proc/spl/kstat/zfs/vdev_cache_stats +echo +echo SLAB +cat /proc/spl/kmem/slab +echo + +./unload-zfs.sh diff --git a/zfs/Makefile.in b/zfs/Makefile.in new file mode 100644 index 0000000000..e0b30c8373 --- /dev/null +++ b/zfs/Makefile.in @@ -0,0 +1,16 @@ +subdir-m += lib +subdir-m += zcmd + +all: +# Make the exported SPL symbols available to this module. There +# is probably a better way to do this, but this will have to do +# for now... an option to modpost perhaps. + cp @splsymvers@ . + +# Kick off the kernel build system + $(MAKE) -C @LINUX@ SUBDIRS=`pwd` @KERNELMAKE_PARAMS@ modules + +install uninstall clean distclean maintainer-clean distdir: + $(MAKE) -C @LINUX@ SUBDIRS=`pwd` @KERNELMAKE_PARAMS@ $@ + +check: diff --git a/zfs/lib/Makefile.in b/zfs/lib/Makefile.in new file mode 100644 index 0000000000..324270db95 --- /dev/null +++ b/zfs/lib/Makefile.in @@ -0,0 +1,12 @@ +subdir-m += libuutil # User space util support +subdir-m += libumem # User space memory support +subdir-m += libzfs # User space library support +subdir-m += libsolcompat # User space compatibility library + +subdir-m += libzpool # Kernel DMU/SPA +subdir-m += libdmu-ctl # Kernel control interface + +subdir-m += libavl # Kernel + user space AVL tree support +subdir-m += libnvpair # Kernel + user space name/value support +subdir-m += libzcommon # Kernel + user space common support +subdir-m += libport # Kernel + user space linux support diff --git a/zfs/lib/libavl/Makefile.in b/zfs/lib/libavl/Makefile.in new file mode 100644 index 0000000000..6b9d4d5401 --- /dev/null +++ b/zfs/lib/libavl/Makefile.in @@ -0,0 +1,31 @@ +subdir-m += include +DISTFILES = avl.c + +MODULE := zavl +LIBRARY := libavl + +# Compile as kernel module. Needed symlinks created for all +# k* objects created by top level configure script. + +EXTRA_CFLAGS = @KERNELCPPFLAGS@ +EXTRA_CFLAGS += -I@LIBDIR@/libavl/include + +obj-m := ${MODULE}.o + +${MODULE}-objs += kavl.o # Generic AVL support + +# Compile as shared library. There's an extra useless host program +# here called 'zu' because it was the easiest way I could convince +# the kernel build system to construct a user space shared library. 
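+#
+# (Roughly how the trick below works: kbuild knows how to compile and
+# link host programs, but has no rule for a bare user space shared
+# library. Listing ${LIBRARY}.so among the objects of the dummy host
+# program 'zu' coaxes kbuild into producing the library while resolving
+# zu's link dependencies.)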
+
+HOSTCFLAGS += @HOSTCFLAGS@
+HOSTCFLAGS += -I@LIBDIR@/libsolcompat/include
+HOSTCFLAGS += -I@LIBDIR@/libport/include
+HOSTCFLAGS += -I@LIBDIR@/libavl/include
+
+hostprogs-y := zu
+always := $(hostprogs-y)
+
+zu-objs := zu.o ${LIBRARY}.so
+
+${LIBRARY}-objs += uavl.o
diff --git a/zfs/lib/libavl/avl.c b/zfs/lib/libavl/avl.c
new file mode 100644
index 0000000000..ff3ad52b78
--- /dev/null
+++ b/zfs/lib/libavl/avl.c
@@ -0,0 +1,969 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+
+/*
+ * AVL - generic AVL tree implementation for kernel use
+ *
+ * A complete description of AVL trees can be found in many CS textbooks.
+ *
+ * Here is a very brief overview. An AVL tree is a binary search tree that is
+ * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
+ * any given node, the left and right subtrees are allowed to differ in height
+ * by at most 1 level.
+ *
+ * This relaxation from a perfectly balanced binary tree allows doing
+ * insertion and deletion relatively efficiently. Searching the tree is
+ * still a fast operation, roughly O(log(N)).
+ *
+ * The key to insertion and deletion is a set of tree manipulations called
+ * rotations, which bring unbalanced subtrees back into the semi-balanced state.
+ *
+ * This implementation of AVL trees has the following peculiarities:
+ *
+ *	- The AVL specific data structures are physically embedded as fields
+ *	  in the "using" data structures. To maintain generality the code
+ *	  must constantly translate between "avl_node_t *" and containing
+ *	  data structure "void *"s by adding/subtracting the avl_offset.
+ *
+ *	- Since the AVL data is always embedded in other structures, there is
+ *	  no locking or memory allocation in the AVL routines. This must be
+ *	  provided for by the enclosing data structure's semantics. Typically,
+ *	  avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
+ *	  exclusive write lock. Other operations require a read lock.
+ *
+ *	- The implementation uses iteration instead of explicit recursion,
+ *	  since it is intended to run on limited size kernel stacks. Since
+ *	  there is no recursion stack present to move "up" in the tree,
+ *	  there is an explicit "parent" link in the avl_node_t.
+ *
+ *	- The left/right children pointers of a node are in an array.
+ *	  In the code, variables (instead of constants) are used to represent
+ *	  left and right indices. The implementation is written as if it only
+ *	  dealt with left handed manipulations. By changing the value assigned
+ *	  to "left", the code also works for right handed trees. The
+ *	  following variables/terms are frequently used:
+ *
+ *		int left;	// 0 when dealing with left children,
+ *				// 1 for dealing with right children
+ *
+ *		int left_heavy;	// -1 when left subtree is taller at some node,
+ *				// +1 when right subtree is taller
+ *
+ *		int right;	// will be the opposite of left (0 or 1)
+ *		int right_heavy;// will be the opposite of left_heavy (-1 or 1)
+ *
+ *		int direction;	// 0 for "<" (ie. left child); 1 for ">" (right)
+ *
+ *	  Though it is a little more confusing to read the code, the approach
+ *	  allows using half as much code (and hence cache footprint) for tree
+ *	  manipulations and eliminates many conditional branches.
+ *
+ *	- The avl_index_t is an opaque "cookie" used to find nodes at or
+ *	  adjacent to where a new value would be inserted in the tree. The value
+ *	  is a modified "avl_node_t *". The bottom bit (normally 0 for a
+ *	  pointer) is set to indicate that the new node has a value greater
+ *	  than the value of the indicated "avl_node_t *".
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/debug.h>
+#include <sys/avl.h>
+#include <sys/cmn_err.h>
+
+/*
+ * Small arrays to translate between balance (or diff) values and child indices.
+ *
+ * Code that deals with binary tree data structures will randomly use
+ * left and right children when examining a tree. C "if()" statements
+ * which evaluate randomly suffer from very poor hardware branch prediction.
+ * In this code we avoid some of the branch mispredictions by using the
+ * following translation arrays. They replace random branches with an
+ * additional memory reference. Since the translation arrays are both very
+ * small the data should remain efficiently in cache.
+ */
+static const int  avl_child2balance[2]	= {-1, 1};
+static const int  avl_balance2child[]	= {0, 0, 1};
+
+
+/*
+ * Walk from one node to the previous valued node (ie. an infix walk
+ * towards the left). At any given node we do one of 2 things:
+ *
+ * - If there is a left child, go to it, then to its rightmost descendant.
+ *
+ * - otherwise we return thru parent nodes until we've come from a right child.
+ *
+ * Return Value:
+ * NULL - if at the end of the nodes
+ * otherwise next node
+ */
+void *
+avl_walk(avl_tree_t *tree, void *oldnode, int left)
+{
+	size_t off = tree->avl_offset;
+	avl_node_t *node = AVL_DATA2NODE(oldnode, off);
+	int right = 1 - left;
+	int was_child;
+
+
+	/*
+	 * nowhere to walk to if tree is empty
+	 */
+	if (node == NULL)
+		return (NULL);
+
+	/*
+	 * Visit the previous valued node. There are two possibilities:
+	 *
+	 * If this node has a left child, go down one left, then all
+	 * the way right.
+	 */
+	if (node->avl_child[left] != NULL) {
+		for (node = node->avl_child[left];
+		    node->avl_child[right] != NULL;
+		    node = node->avl_child[right])
+			;
+	/*
+	 * Otherwise, return thru left children as far as we can.
+	 */
+	} else {
+		for (;;) {
+			was_child = AVL_XCHILD(node);
+			node = AVL_XPARENT(node);
+			if (node == NULL)
+				return (NULL);
+			if (was_child == right)
+				break;
+		}
+	}
+
+	return (AVL_NODE2DATA(node, off));
+}
+
+/*
+ * Return the lowest valued node in a tree or NULL.
+ * (leftmost child from root of tree)
+ */
+void *
+avl_first(avl_tree_t *tree)
+{
+	avl_node_t *node;
+	avl_node_t *prev = NULL;
+	size_t off = tree->avl_offset;
+
+	for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
+		prev = node;
+
+	if (prev != NULL)
+		return (AVL_NODE2DATA(prev, off));
+	return (NULL);
+}
+
+/*
+ * Return the highest valued node in a tree or NULL.
+ * (rightmost child from root of tree) + */ +void * +avl_last(avl_tree_t *tree) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) + prev = node; + + if (prev != NULL) + return (AVL_NODE2DATA(prev, off)); + return (NULL); +} + +/* + * Access the node immediately before or after an insertion point. + * + * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child + * + * Return value: + * NULL: no node in the given direction + * "void *" of the found tree node + */ +void * +avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) +{ + int child = AVL_INDEX2CHILD(where); + avl_node_t *node = AVL_INDEX2NODE(where); + void *data; + size_t off = tree->avl_offset; + + if (node == NULL) { + ASSERT(tree->avl_root == NULL); + return (NULL); + } + data = AVL_NODE2DATA(node, off); + if (child != direction) + return (data); + + return (avl_walk(tree, data, direction)); +} + + +/* + * Search for the node which contains "value". The algorithm is a + * simple binary tree search. + * + * return value: + * NULL: the value is not in the AVL tree + * *where (if not NULL) is set to indicate the insertion point + * "void *" of the found tree node + */ +void * +avl_find(avl_tree_t *tree, void *value, avl_index_t *where) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + int child = 0; + int diff; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; + node = node->avl_child[child]) { + + prev = node; + + diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); + ASSERT(-1 <= diff && diff <= 1); + if (diff == 0) { +#ifdef DEBUG + if (where != NULL) + *where = 0; +#endif + return (AVL_NODE2DATA(node, off)); + } + child = avl_balance2child[1 + diff]; + + } + + if (where != NULL) + *where = AVL_MKINDEX(prev, child); + + return (NULL); +} + + +/* + * Perform a rotation to restore balance at the subtree given by depth. + * + * This routine is used by both insertion and deletion. The return value + * indicates: + * 0 : subtree did not change height + * !0 : subtree was reduced in height + * + * The code is written as if handling left rotations, right rotations are + * symmetric and handled by swapping values of variables right/left[_heavy] + * + * On input balance is the "new" balance at "node". This value is either + * -2 or +2. + */ +static int +avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) +{ + int left = !(balance < 0); /* when balance = -2, left will be 0 */ + int right = 1 - left; + int left_heavy = balance >> 1; + int right_heavy = -left_heavy; + avl_node_t *parent = AVL_XPARENT(node); + avl_node_t *child = node->avl_child[left]; + avl_node_t *cright; + avl_node_t *gchild; + avl_node_t *gright; + avl_node_t *gleft; + int which_child = AVL_XCHILD(node); + int child_bal = AVL_XBALANCE(child); + + /* BEGIN CSTYLED */ + /* + * case 1 : node is overly left heavy, the left child is balanced or + * also left heavy. This requires the following rotation. + * + * (node bal:-2) + * / \ + * / \ + * (child bal:0 or -1) + * / \ + * / \ + * cright + * + * becomes: + * + * (child bal:1 or 0) + * / \ + * / \ + * (node bal:-1 or 0) + * / \ + * / \ + * cright + * + * we detect this situation by noting that child's balance is not + * right_heavy. 
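+	 *
+	 * Worked instance of the variable setup above: entering with
+	 * balance == -2 gives left == 0, right == 1, left_heavy == -1 and
+	 * right_heavy == +1, so this branch is taken exactly when the left
+	 * child's balance is 0 or -1, matching the picture.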
+ */ + /* END CSTYLED */ + if (child_bal != right_heavy) { + + /* + * compute new balance of nodes + * + * If child used to be left heavy (now balanced) we reduced + * the height of this sub-tree -- used in "return...;" below + */ + child_bal += right_heavy; /* adjust towards right */ + + /* + * move "cright" to be node's left child + */ + cright = child->avl_child[right]; + node->avl_child[left] = cright; + if (cright != NULL) { + AVL_SETPARENT(cright, node); + AVL_SETCHILD(cright, left); + } + + /* + * move node to be child's right child + */ + child->avl_child[right] = node; + AVL_SETBALANCE(node, -child_bal); + AVL_SETCHILD(node, right); + AVL_SETPARENT(node, child); + + /* + * update the pointer into this subtree + */ + AVL_SETBALANCE(child, child_bal); + AVL_SETCHILD(child, which_child); + AVL_SETPARENT(child, parent); + if (parent != NULL) + parent->avl_child[which_child] = child; + else + tree->avl_root = child; + + return (child_bal == 0); + } + + /* BEGIN CSTYLED */ + /* + * case 2 : When node is left heavy, but child is right heavy we use + * a different rotation. + * + * (node b:-2) + * / \ + * / \ + * / \ + * (child b:+1) + * / \ + * / \ + * (gchild b: != 0) + * / \ + * / \ + * gleft gright + * + * becomes: + * + * (gchild b:0) + * / \ + * / \ + * / \ + * (child b:?) (node b:?) + * / \ / \ + * / \ / \ + * gleft gright + * + * computing the new balances is more complicated. As an example: + * if gchild was right_heavy, then child is now left heavy + * else it is balanced + */ + /* END CSTYLED */ + gchild = child->avl_child[right]; + gleft = gchild->avl_child[left]; + gright = gchild->avl_child[right]; + + /* + * move gright to left child of node and + * + * move gleft to right child of node + */ + node->avl_child[left] = gright; + if (gright != NULL) { + AVL_SETPARENT(gright, node); + AVL_SETCHILD(gright, left); + } + + child->avl_child[right] = gleft; + if (gleft != NULL) { + AVL_SETPARENT(gleft, child); + AVL_SETCHILD(gleft, right); + } + + /* + * move child to left child of gchild and + * + * move node to right child of gchild and + * + * fixup parent of all this to point to gchild + */ + balance = AVL_XBALANCE(gchild); + gchild->avl_child[left] = child; + AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); + AVL_SETPARENT(child, gchild); + AVL_SETCHILD(child, left); + + gchild->avl_child[right] = node; + AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); + AVL_SETPARENT(node, gchild); + AVL_SETCHILD(node, right); + + AVL_SETBALANCE(gchild, 0); + AVL_SETPARENT(gchild, parent); + AVL_SETCHILD(gchild, which_child); + if (parent != NULL) + parent->avl_child[which_child] = gchild; + else + tree->avl_root = gchild; + + return (1); /* the new tree is always shorter */ +} + + +/* + * Insert a new node into an AVL tree at the specified (from avl_find()) place. + * + * Newly inserted nodes are always leaf nodes in the tree, since avl_find() + * searches out to the leaf positions. The avl_index_t indicates the node + * which will be the parent of the new node. + * + * After the node is inserted, a single rotation further up the tree may + * be necessary to maintain an acceptable AVL balance. 
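+ *
+ * A typical use together with avl_find() (sketch; "tree" and "new_node"
+ * are assumed to be set up as described in avl.h):
+ *
+ *	avl_index_t where;
+ *
+ *	if (avl_find(tree, new_node, &where) == NULL)
+ *		avl_insert(tree, new_node, where);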
+ */ +void +avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) +{ + avl_node_t *node; + avl_node_t *parent = AVL_INDEX2NODE(where); + int old_balance; + int new_balance; + int which_child = AVL_INDEX2CHILD(where); + size_t off = tree->avl_offset; + + ASSERT(tree); +#ifdef _LP64 + ASSERT(((uintptr_t)new_data & 0x7) == 0); +#endif + + node = AVL_DATA2NODE(new_data, off); + + /* + * First, add the node to the tree at the indicated position. + */ + ++tree->avl_numnodes; + + node->avl_child[0] = NULL; + node->avl_child[1] = NULL; + + AVL_SETCHILD(node, which_child); + AVL_SETBALANCE(node, 0); + AVL_SETPARENT(node, parent); + if (parent != NULL) { + ASSERT(parent->avl_child[which_child] == NULL); + parent->avl_child[which_child] = node; + } else { + ASSERT(tree->avl_root == NULL); + tree->avl_root = node; + } + /* + * Now, back up the tree modifying the balance of all nodes above the + * insertion point. If we get to a highly unbalanced ancestor, we + * need to do a rotation. If we back out of the tree we are done. + * If we brought any subtree into perfect balance (0), we are also done. + */ + for (;;) { + node = parent; + if (node == NULL) + return; + + /* + * Compute the new balance + */ + old_balance = AVL_XBALANCE(node); + new_balance = old_balance + avl_child2balance[which_child]; + + /* + * If we introduced equal balance, then we are done immediately + */ + if (new_balance == 0) { + AVL_SETBALANCE(node, 0); + return; + } + + /* + * If both old and new are not zero we went + * from -1 to -2 balance, do a rotation. + */ + if (old_balance != 0) + break; + + AVL_SETBALANCE(node, new_balance); + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + } + + /* + * perform a rotation to fix the tree and return + */ + (void) avl_rotation(tree, node, new_balance); +} + +/* + * Insert "new_data" in "tree" in the given "direction" either after or + * before (AVL_AFTER, AVL_BEFORE) the data "here". + * + * Insertions can only be done at empty leaf points in the tree, therefore + * if the given child of the node is already present we move to either + * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since + * every other node in the tree is a leaf, this always works. + * + * To help developers using this interface, we assert that the new node + * is correctly ordered at every step of the way in DEBUG kernels. + */ +void +avl_insert_here( + avl_tree_t *tree, + void *new_data, + void *here, + int direction) +{ + avl_node_t *node; + int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ +#ifdef DEBUG + int diff; +#endif + + ASSERT(tree != NULL); + ASSERT(new_data != NULL); + ASSERT(here != NULL); + ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); + + /* + * If corresponding child of node is not NULL, go to the neighboring + * node and reverse the insertion direction. + */ + node = AVL_DATA2NODE(here, tree->avl_offset); + +#ifdef DEBUG + diff = tree->avl_compar(new_data, here); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + + if (node->avl_child[child] != NULL) { + node = node->avl_child[child]; + child = 1 - child; + while (node->avl_child[child] != NULL) { +#ifdef DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? 
child == 1 : child == 0); +#endif + node = node->avl_child[child]; + } +#ifdef DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + } + ASSERT(node->avl_child[child] == NULL); + + avl_insert(tree, new_data, AVL_MKINDEX(node, child)); +} + +/* + * Add a new node to an AVL tree. + */ +void +avl_add(avl_tree_t *tree, void *new_node) +{ + avl_index_t where; + + /* + * This is unfortunate. We want to call panic() here, even for + * non-DEBUG kernels. In userland, however, we can't depend on anything + * in libc or else the rtld build process gets confused. So, all we can + * do in userland is resort to a normal ASSERT(). + */ + if (avl_find(tree, new_node, &where) != NULL) +#ifdef _KERNEL + panic("avl_find() succeeded inside avl_add()"); +#else + ASSERT(0); +#endif + avl_insert(tree, new_node, where); +} + +/* + * Delete a node from the AVL tree. Deletion is similar to insertion, but + * with 2 complications. + * + * First, we may be deleting an interior node. Consider the following subtree: + * + * d c c + * / \ / \ / \ + * b e b e b e + * / \ / \ / + * a c a a + * + * When we are deleting node (d), we find and bring up an adjacent valued leaf + * node, say (c), to take the interior node's place. In the code this is + * handled by temporarily swapping (d) and (c) in the tree and then using + * common code to delete (d) from the leaf position. + * + * Secondly, an interior deletion from a deep tree may require more than one + * rotation to fix the balance. This is handled by moving up the tree through + * parents and applying rotations as needed. The return value from + * avl_rotation() is used to detect when a subtree did not change overall + * height due to a rotation. + */ +void +avl_remove(avl_tree_t *tree, void *data) +{ + avl_node_t *delete; + avl_node_t *parent; + avl_node_t *node; + avl_node_t tmp; + int old_balance; + int new_balance; + int left; + int right; + int which_child; + size_t off = tree->avl_offset; + + ASSERT(tree); + + delete = AVL_DATA2NODE(data, off); + + /* + * Deletion is easiest with a node that has at most 1 child. + * We swap a node with 2 children with a sequentially valued + * neighbor node. That node will have at most 1 child. Note this + * has no effect on the ordering of the remaining nodes. + * + * As an optimization, we choose the greater neighbor if the tree + * is right heavy, otherwise the left neighbor. This reduces the + * number of rotations needed. + */ + if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { + + /* + * choose node to swap from whichever side is taller + */ + old_balance = AVL_XBALANCE(delete); + left = avl_balance2child[old_balance + 1]; + right = 1 - left; + + /* + * get to the previous value'd node + * (down 1 left, as far as possible right) + */ + for (node = delete->avl_child[left]; + node->avl_child[right] != NULL; + node = node->avl_child[right]) + ; + + /* + * create a temp placeholder for 'node' + * move 'node' to delete's spot in the tree + */ + tmp = *node; + + *node = *delete; + if (node->avl_child[left] == node) + node->avl_child[left] = &tmp; + + parent = AVL_XPARENT(node); + if (parent != NULL) + parent->avl_child[AVL_XCHILD(node)] = node; + else + tree->avl_root = node; + AVL_SETPARENT(node->avl_child[left], node); + AVL_SETPARENT(node->avl_child[right], node); + + /* + * Put tmp where node used to be (just temporary). + * It always has a parent and at most 1 child. 
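+	 * (The placeholder always has a parent: in this two-child case
+	 * the replacement node was found strictly below 'delete', so the
+	 * spot it vacated can never be the root of the tree.)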
+ */ + delete = &tmp; + parent = AVL_XPARENT(delete); + parent->avl_child[AVL_XCHILD(delete)] = delete; + which_child = (delete->avl_child[1] != 0); + if (delete->avl_child[which_child] != NULL) + AVL_SETPARENT(delete->avl_child[which_child], delete); + } + + + /* + * Here we know "delete" is at least partially a leaf node. It can + * be easily removed from the tree. + */ + ASSERT(tree->avl_numnodes > 0); + --tree->avl_numnodes; + parent = AVL_XPARENT(delete); + which_child = AVL_XCHILD(delete); + if (delete->avl_child[0] != NULL) + node = delete->avl_child[0]; + else + node = delete->avl_child[1]; + + /* + * Connect parent directly to node (leaving out delete). + */ + if (node != NULL) { + AVL_SETPARENT(node, parent); + AVL_SETCHILD(node, which_child); + } + if (parent == NULL) { + tree->avl_root = node; + return; + } + parent->avl_child[which_child] = node; + + + /* + * Since the subtree is now shorter, begin adjusting parent balances + * and performing any needed rotations. + */ + do { + + /* + * Move up the tree and adjust the balance + * + * Capture the parent and which_child values for the next + * iteration before any rotations occur. + */ + node = parent; + old_balance = AVL_XBALANCE(node); + new_balance = old_balance - avl_child2balance[which_child]; + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + + /* + * If a node was in perfect balance but isn't anymore then + * we can stop, since the height didn't change above this point + * due to a deletion. + */ + if (old_balance == 0) { + AVL_SETBALANCE(node, new_balance); + break; + } + + /* + * If the new balance is zero, we don't need to rotate + * else + * need a rotation to fix the balance. + * If the rotation doesn't change the height + * of the sub-tree we have finished adjusting. + */ + if (new_balance == 0) + AVL_SETBALANCE(node, new_balance); + else if (!avl_rotation(tree, node, new_balance)) + break; + } while (parent != NULL); +} + +/* + * initialize a new AVL tree + */ +void +avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), + size_t size, size_t offset) +{ + ASSERT(tree); + ASSERT(compar); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (avl_node_t)); +#ifdef _LP64 + ASSERT((offset & 0x7) == 0); +#endif + + tree->avl_compar = compar; + tree->avl_root = NULL; + tree->avl_numnodes = 0; + tree->avl_size = size; + tree->avl_offset = offset; +} + +/* + * Delete a tree. + */ +/* ARGSUSED */ +void +avl_destroy(avl_tree_t *tree) +{ + ASSERT(tree); + ASSERT(tree->avl_numnodes == 0); + ASSERT(tree->avl_root == NULL); +} + + +/* + * Return the number of nodes in an AVL tree. + */ +ulong_t +avl_numnodes(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes); +} + + +#define CHILDBIT (1L) + +/* + * Post-order tree walk used to visit all tree nodes and destroy the tree + * in post order. This is used for destroying a tree w/o paying any cost + * for rebalancing it. + * + * example: + * + * void *cookie = NULL; + * my_data_t *node; + * + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + * + * The cookie is really an avl_node_t to the current node's parent and + * an indication of which child you looked at last. + * + * On input, a cookie value of CHILDBIT indicates the tree is done. + */ +void * +avl_destroy_nodes(avl_tree_t *tree, void **cookie) +{ + avl_node_t *node; + avl_node_t *parent; + int child; + void *first; + size_t off = tree->avl_offset; + + /* + * Initial calls go to the first node or it's right descendant. 
+ */ + if (*cookie == NULL) { + first = avl_first(tree); + + /* + * deal with an empty tree + */ + if (first == NULL) { + *cookie = (void *)CHILDBIT; + return (NULL); + } + + node = AVL_DATA2NODE(first, off); + parent = AVL_XPARENT(node); + goto check_right_side; + } + + /* + * If there is no parent to return to we are done. + */ + parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); + if (parent == NULL) { + if (tree->avl_root != NULL) { + ASSERT(tree->avl_numnodes == 1); + tree->avl_root = NULL; + tree->avl_numnodes = 0; + } + return (NULL); + } + + /* + * Remove the child pointer we just visited from the parent and tree. + */ + child = (uintptr_t)(*cookie) & CHILDBIT; + parent->avl_child[child] = NULL; + ASSERT(tree->avl_numnodes > 1); + --tree->avl_numnodes; + + /* + * If we just did a right child or there isn't one, go up to parent. + */ + if (child == 1 || parent->avl_child[1] == NULL) { + node = parent; + parent = AVL_XPARENT(parent); + goto done; + } + + /* + * Do parent's right child, then leftmost descendent. + */ + node = parent->avl_child[1]; + while (node->avl_child[0] != NULL) { + parent = node; + node = node->avl_child[0]; + } + + /* + * If here, we moved to a left child. It may have one + * child on the right (when balance == +1). + */ +check_right_side: + if (node->avl_child[1] != NULL) { + ASSERT(AVL_XBALANCE(node) == 1); + parent = node; + node = node->avl_child[1]; + ASSERT(node->avl_child[0] == NULL && + node->avl_child[1] == NULL); + } else { + ASSERT(AVL_XBALANCE(node) <= 0); + } + +done: + if (parent == NULL) { + *cookie = (void *)CHILDBIT; + ASSERT(node == tree->avl_root); + } else { + *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); + } + + return (AVL_NODE2DATA(node, off)); +} diff --git a/zfs/lib/libavl/include/Makefile.in b/zfs/lib/libavl/include/Makefile.in new file mode 100644 index 0000000000..6611e4143d --- /dev/null +++ b/zfs/lib/libavl/include/Makefile.in @@ -0,0 +1 @@ +subdir-m += sys diff --git a/zfs/lib/libavl/include/sys/Makefile.in b/zfs/lib/libavl/include/sys/Makefile.in new file mode 100644 index 0000000000..8149c3861f --- /dev/null +++ b/zfs/lib/libavl/include/sys/Makefile.in @@ -0,0 +1 @@ +DISTFILES = avl.h avl_impl.h diff --git a/zfs/lib/libavl/include/sys/avl.h b/zfs/lib/libavl/include/sys/avl.h new file mode 100644 index 0000000000..1c1f11b2b2 --- /dev/null +++ b/zfs/lib/libavl/include/sys/avl.h @@ -0,0 +1,298 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AVL_H +#define _AVL_H + + + +/* + * This is a private header file. Applications should not directly include + * this file. 
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+/*
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
+ * The interfaces provide an efficient way of implementing an ordered set of
+ * data structures.
+ *
+ * AVL trees provide an alternative to using an ordered linked list. Using AVL
+ * trees will usually be faster, however they require more storage. An ordered
+ * linked list in general requires 2 pointers in each data structure. The
+ * AVL tree implementation uses 3 pointers. The following chart gives the
+ * approximate performance of operations with the different approaches:
+ *
+ *	Operation		 Link List	AVL tree
+ *	---------		 ---------	--------
+ *	lookup			 O(n)		O(log(n))
+ *
+ *	insert 1 node		 constant	constant
+ *
+ *	delete 1 node		 constant	between constant and O(log(n))
+ *
+ *	delete all nodes	 O(n)		O(n)
+ *
+ *	visit the next
+ *	or prev node		 constant	between constant and O(log(n))
+ *
+ *
+ * The data structure nodes are anchored at an "avl_tree_t" (the equivalent
+ * of a list header) and the individual nodes will have a field of
+ * type "avl_node_t" (corresponding to list pointers).
+ *
+ * The type "avl_index_t" is used to indicate a position in the list for
+ * certain calls.
+ *
+ * The usage scenario is generally:
+ *
+ * 1. Create the list/tree with: avl_create()
+ *
+ * followed by any mixture of:
+ *
+ * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert()
+ *
+ * 2b. Visit elements with:
+ *	avl_first() - returns the lowest valued node
+ *	avl_last() - returns the highest valued node
+ *	AVL_NEXT() - given a node go to next higher one
+ *	AVL_PREV() - given a node go to previous lower one
+ *
+ * 2c. Find the node with the closest value either less than or greater
+ *	than a given value with avl_nearest().
+ *
+ * 2d. Remove individual nodes from the list/tree with avl_remove().
+ *
+ * and finally when the list is being destroyed
+ *
+ * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes.
+ *	Note that once you use avl_destroy_nodes(), you can no longer
+ *	use any routine except avl_destroy_nodes() and avl_destroy().
+ *
+ * 4. Use avl_destroy() to destroy the AVL tree itself.
+ *
+ * Any locking for multiple thread access is up to the user to provide, just
+ * as is needed for any linked list implementation.
+ */
+
+
+/*
+ * Type used for the root of the AVL tree.
+ */
+typedef struct avl_tree avl_tree_t;
+
+/*
+ * The data nodes in the AVL tree must have a field of this type.
+ */
+typedef struct avl_node avl_node_t;
+
+/*
+ * An opaque type used to locate a position in the tree where a node
+ * would be inserted.
+ */
+typedef uintptr_t avl_index_t;
+
+
+/*
+ * Direction constants used for avl_nearest().
+ */
+#define	AVL_BEFORE	(0)
+#define	AVL_AFTER	(1)
+
+
+
+/*
+ * Prototypes
+ *
+ * Where not otherwise mentioned, "void *" arguments are a pointer to the
+ * user data structure which must contain a field of type avl_node_t.
+ *
+ * Also assume the user data structure looks like:
+ *	struct my_type {
+ *		...
+ *		avl_node_t	my_link;
+ *		...
+ *	};
+ */
+
+/*
+ * Initialize an AVL tree. Arguments are:
+ *
+ * tree   - the tree to be initialized
+ * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
+ *          -1 for <, 0 for ==, and +1 for >
+ * size   - the value of sizeof(struct my_type)
+ * offset - the value of OFFSETOF(struct my_type, my_link)
+ */
+extern void avl_create(avl_tree_t *tree,
+	int (*compar) (const void *, const void *), size_t size, size_t offset);
+
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * found. If not found, it returns NULL and then if "where" is not NULL it sets
+ * "where" for use with avl_insert() or avl_nearest().
+ *
+ * node   - node that has the value being looked for
+ * where  - position for use with avl_nearest() or avl_insert(), may be NULL
+ */
+extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node   - the node to insert
+ * where  - position as returned from avl_find()
+ */
+extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where);
+
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after
+ * or before the data "here".
+ *
+ * This might be useful for avl clients caching recently accessed
+ * data to avoid doing avl_find() again for insertion.
+ *
+ * new_data	- new data to insert
+ * here		- existing node in "tree"
+ * direction	- either AVL_AFTER or AVL_BEFORE the data "here".
+ */
+extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here,
+    int direction);
+
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL
+ * if the tree is empty.
+ *
+ */
+extern void *avl_first(avl_tree_t *tree);
+extern void *avl_last(avl_tree_t *tree);
+
+
+/*
+ * Return the next or previous valued node in the tree.
+ * AVL_NEXT() will return NULL if at the last node.
+ * AVL_PREV() will return NULL if at the first node.
+ *
+ * node   - the node from which the next or previous node is found
+ */
+#define	AVL_NEXT(tree, node)	avl_walk(tree, node, AVL_AFTER)
+#define	AVL_PREV(tree, node)	avl_walk(tree, node, AVL_BEFORE)
+
+
+/*
+ * Find the node with the nearest value either greater or less than
+ * the value from a previous avl_find(). Returns the node or NULL if
+ * there isn't a matching one.
+ *
+ * where     - position as returned from avl_find()
+ * direction - either AVL_BEFORE or AVL_AFTER
+ *
+ * EXAMPLE get the greatest node that is less than a given value:
+ *
+ *	avl_tree_t *tree;
+ *	struct my_data look_for_value = {....};
+ *	struct my_data *node;
+ *	struct my_data *less;
+ *	avl_index_t where;
+ *
+ *	node = avl_find(tree, &look_for_value, &where);
+ *	if (node != NULL)
+ *		less = AVL_PREV(tree, node);
+ *	else
+ *		less = avl_nearest(tree, where, AVL_BEFORE);
+ */
+extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction);
+
+
+/*
+ * Add a single node to the tree.
+ * The node must not be in the tree, and it must not
+ * compare equal to any other node already in the tree.
+ *
+ * node   - the node to add
+ */
+extern void avl_add(avl_tree_t *tree, void *node);
+
+
+/*
+ * Remove a single node from the tree. The node must be in the tree.
+ *
+ * node   - the node to remove
+ */
+extern void avl_remove(avl_tree_t *tree, void *node);
+
+
+/*
+ * Return the number of nodes in the tree
+ */
+extern ulong_t avl_numnodes(avl_tree_t *tree);
+
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call avl_destroy_nodes(), you can only continue calling it and
+ * finally avl_destroy(). No other AVL routines will be valid.
+ *
+ * cookie - a "void *" used to save state between calls to avl_destroy_nodes()
+ *
+ * EXAMPLE:
+ *	avl_tree_t *tree;
+ *	struct my_data *node;
+ *	void *cookie;
+ *
+ *	cookie = NULL;
+ *	while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ *		free(node);
+ *	avl_destroy(tree);
+ */
+extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie);
+
+
+/*
+ * Final destroy of an AVL tree. Arguments are:
+ *
+ * tree   - the empty tree to destroy
+ */
+extern void avl_destroy(avl_tree_t *tree);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _AVL_H */
diff --git a/zfs/lib/libavl/include/sys/avl_impl.h b/zfs/lib/libavl/include/sys/avl_impl.h
new file mode 100644
index 0000000000..fddf76906d
--- /dev/null
+++ b/zfs/lib/libavl/include/sys/avl_impl.h
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_AVL_IMPL_H
+#define	_AVL_IMPL_H
+
+
+
+/*
+ * This is a private header file. Applications should not directly include
+ * this file.
+ */
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+
+/*
+ * generic AVL tree implementation for kernel use
+ *
+ * There are 5 pieces of information stored for each node in an AVL tree
+ *
+ *	pointer to less than child
+ *	pointer to greater than child
+ *	a pointer to the parent of this node
+ *	an indication  [0/1]  of which child I am of my parent
+ *	a "balance" (-1, 0, +1)  indicating which child tree is taller
+ *
+ * Since they only need 3 bits, the last two fields are packed into the
+ * bottom bits of the parent pointer on 64 bit machines to save on space.
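+ *
+ * (The packing is safe because an avl_node_t is 8 byte aligned on
+ * 64 bit machines, so the low 3 bits of a parent pointer are always
+ * zero; see the _LP64 avl_pcb macros below.)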
+ */
+
+#ifndef _LP64
+
+struct avl_node {
+	struct avl_node *avl_child[2];	/* left/right children */
+	struct avl_node *avl_parent;	/* this node's parent */
+	unsigned short avl_child_index;	/* my index in parent's avl_child[] */
+	short avl_balance;		/* balance value: -1, 0, +1 */
+};
+
+#define	AVL_XPARENT(n)		((n)->avl_parent)
+#define	AVL_SETPARENT(n, p)	((n)->avl_parent = (p))
+
+#define	AVL_XCHILD(n)		((n)->avl_child_index)
+#define	AVL_SETCHILD(n, c)	((n)->avl_child_index = (unsigned short)(c))
+
+#define	AVL_XBALANCE(n)		((n)->avl_balance)
+#define	AVL_SETBALANCE(n, b)	((n)->avl_balance = (short)(b))
+
+#else /* _LP64 */
+
+/*
+ * for 64 bit machines, avl_pcb contains parent pointer, balance and
+ * child_index values packed in the following manner:
+ *
+ *	|63                                  3|        2        |1           0|
+ *	|-------------------------------------|-----------------|-------------|
+ *	|      avl_parent hi order bits       | avl_child_index | avl_balance |
+ *	|                                     |                 |     + 1     |
+ *	|-------------------------------------|-----------------|-------------|
+ *
+ */
+struct avl_node {
+	struct avl_node *avl_child[2];	/* left/right children nodes */
+	uintptr_t avl_pcb;		/* parent, child_index, balance */
+};
+
+/*
+ * macros to extract/set fields in avl_pcb
+ *
+ * pointer to the parent of the current node is the high order bits
+ */
+#define	AVL_XPARENT(n)		((struct avl_node *)((n)->avl_pcb & ~7))
+#define	AVL_SETPARENT(n, p)						\
+	((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p)))
+
+/*
+ * index of this node in its parent's avl_child[]: bit #2
+ */
+#define	AVL_XCHILD(n)		(((n)->avl_pcb >> 2) & 1)
+#define	AVL_SETCHILD(n, c)						\
+	((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2)))
+
+/*
+ * balance indication for a node, lowest 2 bits. A valid balance is
+ * -1, 0, or +1, and is encoded by adding 1 to the value to get the
+ * unsigned values of 0, 1, 2.
+ */
+#define	AVL_XBALANCE(n)		((int)(((n)->avl_pcb & 3) - 1))
+#define	AVL_SETBALANCE(n, b)						\
+	((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1))))
+
+#endif /* _LP64 */
+
+
+
+/*
+ * switch between a node and data pointer for a given tree
+ * the value of "o" is tree->avl_offset
+ */
+#define	AVL_NODE2DATA(n, o)	((void *)((uintptr_t)(n) - (o)))
+#define	AVL_DATA2NODE(d, o)	((struct avl_node *)((uintptr_t)(d) + (o)))
+
+
+
+/*
+ * macros used to create/access an avl_index_t
+ */
+#define	AVL_INDEX2NODE(x)	((avl_node_t *)((x) & ~1))
+#define	AVL_INDEX2CHILD(x)	((x) & 1)
+#define	AVL_MKINDEX(n, c)	((avl_index_t)(n) | (c))
+
+
+/*
+ * The tree structure. The fields avl_root, avl_compar, and avl_offset come
+ * first since they are needed for avl_find(). We want them to fit into
+ * a single 64 byte cache line to make avl_find() as fast as possible.
+ */
+struct avl_tree {
+	struct avl_node *avl_root;	/* root node in tree */
+	int (*avl_compar)(const void *, const void *);
+	size_t avl_offset;		/* offsetof(type, avl_link_t field) */
+	ulong_t avl_numnodes;		/* number of nodes in the tree */
+	size_t avl_size;		/* sizeof user type struct */
+};
+
+
+/*
+ * This will only be used via AVL_NEXT() or AVL_PREV()
+ */
+extern void *avl_walk(struct avl_tree *, void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _AVL_IMPL_H */
diff --git a/zfs/lib/libdmu-ctl/Makefile.in b/zfs/lib/libdmu-ctl/Makefile.in
new file mode 100644
index 0000000000..c4017b254c
--- /dev/null
+++ b/zfs/lib/libdmu-ctl/Makefile.in
@@ -0,0 +1,28 @@
+# NOTE: dctl_client.c, dctl_common.c, dctl_server.c, dctl_thrpool.c unused
+# by kernel port. Potentially they should just be removed if we don't care
+# about user space Lustre integration from this source base.
+
+# NOTE: For clarity this directory should simply be renamed libzpl and
+# the full kernel implementation should be minimally stubbed out.
+
+subdir-m += include
+DISTFILES = dctl_client.c dctl_common.c dctl_server.c dctl_thrpool.c
+DISTFILES += dmu_send.c rrwlock.c zfs_acl.c zfs_ctldir.c
+DISTFILES += zfs_dir.c zfs_fuid.c zfs_ioctl.c zfs_log.c zfs_replay.c
+DISTFILES += zfs_rlock.c zfs_vfsops.c zfs_vnops.c zvol.c
+
+MODULE := zctl
+
+EXTRA_CFLAGS = @KERNELCPPFLAGS@
+EXTRA_CFLAGS += -I@LIBDIR@/libzcommon/include
+EXTRA_CFLAGS += -I@LIBDIR@/libdmu-ctl/include
+EXTRA_CFLAGS += -I@LIBDIR@/libavl/include
+EXTRA_CFLAGS += -I@LIBDIR@/libport/include
+EXTRA_CFLAGS += -I@LIBDIR@/libnvpair/include
+
+obj-m := ${MODULE}.o
+
+${MODULE}-objs += zvol.o	# Volume emulation interface
+${MODULE}-objs += zfs_ioctl.o	# /dev/zfs_ioctl interface
+${MODULE}-objs += zfs_vfsops.o
+${MODULE}-objs += dmu_send.o
diff --git a/zfs/lib/libdmu-ctl/dctl_client.c b/zfs/lib/libdmu-ctl/dctl_client.c
new file mode 100644
index 0000000000..e3d8f305bc
--- /dev/null
+++ b/zfs/lib/libdmu-ctl/dctl_client.c
@@ -0,0 +1,263 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/*
+ * Try to connect to the socket given in path.
+ *
+ * For nftw() convenience, returns 0 if unsuccessful, otherwise
+ * returns the socket descriptor.
+ */
+static int try_connect(const char *path)
+{
+	struct sockaddr_un name;
+	int sock;
+
+	sock = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (sock == -1) {
+		perror("socket");
+		return 0;
+	}
+
+	/*
+	 * The socket fd cannot be 0 otherwise nftw() will not interpret the
+	 * return code correctly.
+	 */
+	VERIFY(sock != 0);
+
+	name.sun_family = AF_UNIX;
+	strncpy(name.sun_path, path, sizeof(name.sun_path));
+
+	name.sun_path[sizeof(name.sun_path) - 1] = '\0';
+
+	if (connect(sock, (struct sockaddr *) &name, sizeof(name)) == -1) {
+		close(sock);
+		return 0;
+	}
+
+	return sock;
+}
+
+/*
+ * nftw() callback.
+ */
+static int nftw_cb(const char *fpath, const struct stat *sb, int typeflag,
+    struct FTW *ftwbuf)
+{
+	if (!S_ISSOCK(sb->st_mode))
+		return 0;
+
+	if (strcmp(&fpath[ftwbuf->base], SOCKNAME) != 0)
+		return 0;
+
+	return try_connect(fpath);
+}
+
+/*
+ * For convenience, if check_subdirs is true we walk the directory tree to
+ * find a good socket.
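+ *
+ * Typical client usage (sketch; the socket directory shown here is
+ * illustrative, not the actual default):
+ *
+ *	int fd = dctlc_connect("/var/run/zfs", B_TRUE);
+ *	if (fd != -1) {
+ *		(void) dctlc_ioctl(fd, request, arg);
+ *		dctlc_disconnect(fd);
+ *	}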
+ */ +int dctlc_connect(const char *dir, boolean_t check_subdirs) +{ + char *fpath; + int fd; + + if (check_subdirs) + fd = nftw(dir, nftw_cb, 10, FTW_PHYS); + else { + fpath = malloc(strlen(dir) + strlen(SOCKNAME) + 2); + if (fpath == NULL) + return -1; + + strcpy(fpath, dir); + strcat(fpath, "/" SOCKNAME); + + fd = try_connect(fpath); + + free(fpath); + } + + return fd == 0 ? -1 : fd; +} + +void dctlc_disconnect(int fd) +{ + (void) shutdown(fd, SHUT_RDWR); +} + +static int dctl_reply_copyin(int fd, dctl_cmd_t *cmd) +{ + return dctl_send_data(fd, (void *)(uintptr_t) cmd->u.dcmd_copy.ptr, + cmd->u.dcmd_copy.size); +} + +static int dctl_reply_copyinstr(int fd, dctl_cmd_t *cmd) +{ + dctl_cmd_t reply; + char *from; + size_t len, buflen, to_copy; + int error; + + reply.dcmd_msg = DCTL_GEN_REPLY; + + from = (char *)(uintptr_t) cmd->u.dcmd_copy.ptr; + + buflen = cmd->u.dcmd_copy.size; + to_copy = strnlen(from, buflen - 1); + + reply.u.dcmd_reply.rc = from[to_copy] == '\0' ? 0 : ENAMETOOLONG; + reply.u.dcmd_reply.size = to_copy; + + error = dctl_send_msg(fd, &reply); + + if (!error && to_copy > 0) + error = dctl_send_data(fd, from, to_copy); + + return error; +} + +static int dctl_reply_copyout(int fd, dctl_cmd_t *cmd) +{ + return dctl_read_data(fd, (void *)(uintptr_t) cmd->u.dcmd_copy.ptr, + cmd->u.dcmd_copy.size); +} + +static int dctl_reply_fd_read(int fd, dctl_cmd_t *cmd) +{ + dctl_cmd_t reply; + void *buf; + int error; + ssize_t rrc, size = cmd->u.dcmd_fd_io.size; + + buf = malloc(size); + if (buf == NULL) + return ENOMEM; + + rrc = read(cmd->u.dcmd_fd_io.fd, buf, size); + + reply.dcmd_msg = DCTL_GEN_REPLY; + reply.u.dcmd_reply.rc = rrc == -1 ? errno : 0; + reply.u.dcmd_reply.size = rrc; + + error = dctl_send_msg(fd, &reply); + + if (!error && rrc > 0) + error = dctl_send_data(fd, buf, rrc); + +out: + free(buf); + + return error; +} + +static int dctl_reply_fd_write(int fd, dctl_cmd_t *cmd) +{ + dctl_cmd_t reply; + void *buf; + int error; + ssize_t wrc, size = cmd->u.dcmd_fd_io.size; + + buf = malloc(size); + if (buf == NULL) + return ENOMEM; + + error = dctl_read_data(fd, buf, size); + if (error) + goto out; + + wrc = write(cmd->u.dcmd_fd_io.fd, buf, size); + + reply.dcmd_msg = DCTL_GEN_REPLY; + reply.u.dcmd_reply.rc = wrc == -1 ? errno : 0; + reply.u.dcmd_reply.size = wrc; + + error = dctl_send_msg(fd, &reply); + +out: + free(buf); + + return error; +} + +int dctlc_ioctl(int fd, int32_t request, void *arg) +{ + int error; + dctl_cmd_t cmd; + + ASSERT(fd != 0); + + cmd.dcmd_msg = DCTL_IOCTL; + + cmd.u.dcmd_ioctl.cmd = request; + cmd.u.dcmd_ioctl.arg = (uintptr_t) arg; + + error = dctl_send_msg(fd, &cmd); + + while (!error && (error = dctl_read_msg(fd, &cmd)) == 0) { + switch (cmd.dcmd_msg) { + case DCTL_IOCTL_REPLY: + error = cmd.u.dcmd_reply.rc; + goto out; + case DCTL_COPYIN: + error = dctl_reply_copyin(fd, &cmd); + break; + case DCTL_COPYINSTR: + error = dctl_reply_copyinstr(fd, &cmd); + break; + case DCTL_COPYOUT: + error = dctl_reply_copyout(fd, &cmd); + break; + case DCTL_FD_READ: + error = dctl_reply_fd_read(fd, &cmd); + break; + case DCTL_FD_WRITE: + error = dctl_reply_fd_write(fd, &cmd); + break; + default: + fprintf(stderr, "%s(): invalid message " + "received.\n", __func__); + error = EINVAL; + goto out; + } + } + +out: + errno = error; + return error ? 
-1 : 0; +} diff --git a/zfs/lib/libdmu-ctl/dctl_common.c b/zfs/lib/libdmu-ctl/dctl_common.c new file mode 100644 index 0000000000..8de37dcb1e --- /dev/null +++ b/zfs/lib/libdmu-ctl/dctl_common.c @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include + +#include +#include + +int dctl_read_msg(int fd, dctl_cmd_t *cmd) +{ + int error; + + /* + * First, read only the magic number and the protocol version. + * + * This prevents blocking forever in case the size of dctl_cmd_t + * shrinks in future protocol versions. + */ + error = dctl_read_data(fd, cmd, DCTL_CMD_HEADER_SIZE); + + if (!error &&cmd->dcmd_magic != DCTL_MAGIC) { + fprintf(stderr, "%s(): invalid magic number\n", __func__); + error = EIO; + } + + if (!error && cmd->dcmd_version != DCTL_PROTOCOL_VER) { + fprintf(stderr, "%s(): invalid protocol version\n", __func__); + error = ENOTSUP; + } + + if (error) + return error; + + /* Get the rest of the command */ + return dctl_read_data(fd, (caddr_t) cmd + DCTL_CMD_HEADER_SIZE, + sizeof(dctl_cmd_t) - DCTL_CMD_HEADER_SIZE); +} + +int dctl_send_msg(int fd, dctl_cmd_t *cmd) +{ + cmd->dcmd_magic = DCTL_MAGIC; + cmd->dcmd_version = DCTL_PROTOCOL_VER; + + return dctl_send_data(fd, cmd, sizeof(dctl_cmd_t)); +} + +int dctl_read_data(int fd, void *ptr, size_t size) +{ + size_t read = 0; + size_t left = size; + ssize_t rc; + + while (left > 0) { + rc = recv(fd, (caddr_t) ptr + read, left, 0); + + /* File descriptor closed */ + if (rc == 0) + return ECONNRESET; + + if (rc == -1) { + if (errno == EINTR) + continue; + return errno; + } + + read += rc; + left -= rc; + } + + return 0; +} + +int dctl_send_data(int fd, const void *ptr, size_t size) +{ + ssize_t rc; + + do { + rc = send(fd, ptr, size, MSG_NOSIGNAL); + } while(rc == -1 && errno == EINTR); + + return rc == size ? 0 : EIO; +} + diff --git a/zfs/lib/libdmu-ctl/dctl_server.c b/zfs/lib/libdmu-ctl/dctl_server.c new file mode 100644 index 0000000000..016278509b --- /dev/null +++ b/zfs/lib/libdmu-ctl/dctl_server.c @@ -0,0 +1,476 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static dctl_sock_info_t ctl_sock = { + .dsi_mtx = PTHREAD_MUTEX_INITIALIZER, + .dsi_fd = -1 +}; + +static int dctl_create_socket_common(); + +/* + * Routines from zfs_ioctl.c + */ +extern int zfs_ioctl_init(); +extern int zfs_ioctl_fini(); +extern int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, + int *rvalp); + +/* + * We can't simply put the client file descriptor in wthr_info_t because we + * have no way of accessing it from the DMU code without extensive + * modifications. + * + * Therefore each worker thread will have its own global thread-specific + * client_fd variable. + */ +static __thread int client_fd = -1; + +int dctls_copyin(const void *src, void *dest, size_t size) +{ + dctl_cmd_t cmd; + + VERIFY(client_fd >= 0); + + cmd.dcmd_msg = DCTL_COPYIN; + cmd.u.dcmd_copy.ptr = (uintptr_t) src; + cmd.u.dcmd_copy.size = size; + + if (dctl_send_msg(client_fd, &cmd) != 0) + return EFAULT; + + if (dctl_read_data(client_fd, dest, size) != 0) + return EFAULT; + + return 0; +} + +int dctls_copyinstr(const char *from, char *to, size_t max, size_t *len) +{ + dctl_cmd_t msg; + size_t copied; + + VERIFY(client_fd >= 0); + + if (max == 0) + return ENAMETOOLONG; + + msg.dcmd_msg = DCTL_COPYINSTR; + msg.u.dcmd_copy.ptr = (uintptr_t) from; + msg.u.dcmd_copy.size = max; + + if (dctl_send_msg(client_fd, &msg) != 0) + return EFAULT; + + if (dctl_read_msg(client_fd, &msg) != 0) + return EFAULT; + + if (msg.dcmd_msg != DCTL_GEN_REPLY) + return EFAULT; + + copied = msg.u.dcmd_reply.size; + + if (copied >= max) + return EFAULT; + + if (copied > 0) + if (dctl_read_data(client_fd, to, copied) != 0) + return EFAULT; + + to[copied] = '\0'; + + if (len != NULL) + *len = copied + 1; + + return msg.u.dcmd_reply.rc; +} + +int dctls_copyout(const void *src, void *dest, size_t size) +{ + dctl_cmd_t cmd; + + VERIFY(client_fd >= 0); + + cmd.dcmd_msg = DCTL_COPYOUT; + cmd.u.dcmd_copy.ptr = (uintptr_t) dest; + cmd.u.dcmd_copy.size = size; + + if (dctl_send_msg(client_fd, &cmd) != 0) + return EFAULT; + + if (dctl_send_data(client_fd, src, size) != 0) + return EFAULT; + + return 0; +} + +int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp) +{ + dctl_cmd_t msg; + uint64_t dsize; + int error; + + VERIFY(client_fd >= 0); + + msg.dcmd_msg = DCTL_FD_READ; + msg.u.dcmd_fd_io.fd = fd; + msg.u.dcmd_fd_io.size = len; + + if ((error = dctl_send_msg(client_fd, &msg)) != 0) + return error; + + if ((error = dctl_read_msg(client_fd, &msg)) != 0) + return error; + + if (msg.dcmd_msg != DCTL_GEN_REPLY) + return EIO; + + if (msg.u.dcmd_reply.rc != 0) + return msg.u.dcmd_reply.rc; + + dsize = msg.u.dcmd_reply.size; + + if (dsize > 0) + error = dctl_read_data(client_fd, buf, dsize); + + *residp = len - dsize; + + return error; +} + +int dctls_fd_write(int fd, const void *src, ssize_t len) +{ + 
dctl_cmd_t msg; + int error; + + VERIFY(client_fd >= 0); + + msg.dcmd_msg = DCTL_FD_WRITE; + msg.u.dcmd_fd_io.fd = fd; + msg.u.dcmd_fd_io.size = len; + + error = dctl_send_msg(client_fd, &msg); + + if (!error) + error = dctl_send_data(client_fd, src, len); + + if (!error) + error = dctl_read_msg(client_fd, &msg); + + if (error) + return error; + + if (msg.dcmd_msg != DCTL_GEN_REPLY) + return EIO; + + if (msg.u.dcmd_reply.rc != 0) + return msg.u.dcmd_reply.rc; + + /* + * We have to do this because the original upstream code + * does not check if residp == len. + */ + if (msg.u.dcmd_reply.size != len) + return EIO; + + return 0; +} + +/* Handle a new connection */ +static void dctl_handle_conn(int sock_fd) +{ + dctl_cmd_t cmd; + dev_t dev = { 0 }; + int rc; + + client_fd = sock_fd; + + while (dctl_read_msg(sock_fd, &cmd) == 0) { + if (cmd.dcmd_msg != DCTL_IOCTL) { + fprintf(stderr, "%s(): unexpected message type.\n", + __func__); + break; + } + + rc = zfsdev_ioctl(dev, cmd.u.dcmd_ioctl.cmd, + (intptr_t) cmd.u.dcmd_ioctl.arg, 0, NULL, NULL); + + cmd.dcmd_msg = DCTL_IOCTL_REPLY; + cmd.u.dcmd_reply.rc = rc; + + if (dctl_send_msg(sock_fd, &cmd) != 0) + break; + } + close(sock_fd); + + client_fd = -1; +} + +/* Main worker thread loop */ +static void *dctl_thread(void *arg) +{ + wthr_info_t *thr = arg; + struct pollfd fds[1]; + + fds[0].events = POLLIN; + + pthread_mutex_lock(&ctl_sock.dsi_mtx); + + while (!thr->wthr_exit) { + /* Clean-up dead threads */ + dctl_thr_join(); + + /* The file descriptor might change in the thread lifetime */ + fds[0].fd = ctl_sock.dsi_fd; + + /* Poll socket with 1-second timeout */ + int rc = poll(fds, 1, 1000); + if (rc == 0 || (rc == -1 && errno == EINTR)) + continue; + + /* Recheck the exit flag */ + if (thr->wthr_exit) + break; + + if (rc == -1) { + /* Unknown error, let's try to recreate the socket */ + close(ctl_sock.dsi_fd); + ctl_sock.dsi_fd = -1; + + if (dctl_create_socket_common() != 0) + break; + + continue; + } + ASSERT(rc == 1); + + short rev = fds[0].revents; + if (rev == 0) + continue; + ASSERT(rev == POLLIN); + + /* + * At this point there should be a connection ready to be + * accepted. + */ + int client_fd = accept(ctl_sock.dsi_fd, NULL, NULL); + /* Many possible errors here, we'll just retry */ + if (client_fd == -1) + continue; + + /* + * Now lets handle the request. This can take a very + * long time (hours even), so we'll let other threads + * handle new connections. + */ + pthread_mutex_unlock(&ctl_sock.dsi_mtx); + + dctl_thr_rebalance(thr, B_FALSE); + dctl_handle_conn(client_fd); + dctl_thr_rebalance(thr, B_TRUE); + + pthread_mutex_lock(&ctl_sock.dsi_mtx); + } + pthread_mutex_unlock(&ctl_sock.dsi_mtx); + + dctl_thr_die(thr); + + return NULL; +} + +static int dctl_create_socket_common() +{ + dctl_sock_info_t *s = &ctl_sock; + size_t size; + int error; + + ASSERT(s->dsi_fd == -1); + + /* + * Unlink old socket, in case it exists. + * We don't care about errors here. 
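The dctls_* helpers above invert the usual kernel/user relationship: when the emulated ioctl needs a copyin, the server asks the client for the client's own memory. For one dctls_copyin() issued during a DCTL_IOCTL, the wire traffic looks roughly like this (a sketch; N is the requested size):

    client (libzfs)                      server worker (DMU context)
    ---------------                      ---------------------------
    DCTL_IOCTL {cmd, arg}        ----->  zfsdev_ioctl() begins
                                 <-----  DCTL_COPYIN {ptr, size=N}
    N raw bytes at ptr           ----->  dctls_copyin() fills dest
                                 <-----  DCTL_IOCTL_REPLY {rc}   (ioctl done)

DCTL_COPYOUT, DCTL_FD_READ and DCTL_FD_WRITE follow the same request/reply shape, with the raw payload bytes trailing the fixed-size dctl_cmd_t in whichever direction the data moves.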
+ */ + unlink(s->dsi_path); + + /* Create the socket */ + s->dsi_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (s->dsi_fd == -1) { + error = errno; + perror("socket"); + return error; + } + + s->dsi_addr.sun_family = AF_UNIX; + + size = sizeof(s->dsi_addr.sun_path) - 1; + strncpy(s->dsi_addr.sun_path, s->dsi_path, size); + + s->dsi_addr.sun_path[size] = '\0'; + + if (bind(s->dsi_fd, (struct sockaddr *) &s->dsi_addr, + sizeof(s->dsi_addr)) != 0) { + error = errno; + perror("bind"); + return error; + } + + if (listen(s->dsi_fd, LISTEN_BACKLOG) != 0) { + error = errno; + perror("listen"); + unlink(s->dsi_path); + return error; + } + + return 0; +} + +static int dctl_create_socket(const char *cfg_dir) +{ + int error; + dctl_sock_info_t *s = &ctl_sock; + + ASSERT(s->dsi_path == NULL); + ASSERT(s->dsi_fd == -1); + + int pathsize = strlen(cfg_dir) + strlen(SOCKNAME) + 2; + if (pathsize > sizeof(s->dsi_addr.sun_path)) + return ENAMETOOLONG; + + s->dsi_path = malloc(pathsize); + if (s->dsi_path == NULL) + return ENOMEM; + + strcpy(s->dsi_path, cfg_dir); + strcat(s->dsi_path, "/" SOCKNAME); + + /* + * For convenience, create the directory in case it doesn't exist. + * We don't care about errors here. + */ + mkdir(cfg_dir, 0770); + + error = dctl_create_socket_common(); + + if (error) { + free(s->dsi_path); + + if (s->dsi_fd != -1) { + close(s->dsi_fd); + s->dsi_fd = -1; + } + } + + return error; +} + +static void dctl_destroy_socket() +{ + dctl_sock_info_t *s = &ctl_sock; + + ASSERT(s->dsi_path != NULL); + ASSERT(s->dsi_fd != -1); + + close(s->dsi_fd); + s->dsi_fd = -1; + + unlink(s->dsi_path); + free(s->dsi_path); +} + +/* + * Initialize the DMU userspace control interface. + * This should be called after kernel_init(). + * + * Note that only very rarely we have more than a couple of simultaneous + * lzfs/lzpool connections. Since the thread pool grows automatically when all + * threads are busy, a good value for min_thr and max_free_thr is 2. + */ +int dctl_server_init(const char *cfg_dir, int min_thr, int max_free_thr) +{ + int error; + + ASSERT(min_thr > 0); + ASSERT(max_free_thr >= min_thr); + + error = zfs_ioctl_init(); + if (error) + return error; + + error = dctl_create_socket(cfg_dir); + if (error) { + (void) zfs_ioctl_fini(); + return error; + } + + error = dctl_thr_pool_create(min_thr, max_free_thr, dctl_thread); + if (error) { + (void) zfs_ioctl_fini(); + dctl_destroy_socket(); + return error; + } + + return 0; +} + +/* + * Terminate control interface. + * This should be called after closing all objsets, but before calling + * kernel_fini(). + * May return EBUSY if the SPA is busy. + * + * Thread pool destruction can take a while due to poll() + * timeout or due to a thread being busy (e.g. a backup is being taken). + */ +int dctl_server_fini() +{ + dctl_thr_pool_stop(); + dctl_destroy_socket(); + + return zfs_ioctl_fini(); +} diff --git a/zfs/lib/libdmu-ctl/dctl_thrpool.c b/zfs/lib/libdmu-ctl/dctl_thrpool.c new file mode 100644 index 0000000000..7b2f9b4c2a --- /dev/null +++ b/zfs/lib/libdmu-ctl/dctl_thrpool.c @@ -0,0 +1,253 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
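Per the usage notes in dmu_ctl.h (later in this commit), dctl_server_init() must run after the userspace DMU is brought up and dctl_server_fini() before it is torn down. A minimal daemon skeleton might look like the sketch below; kernel_init()/kernel_fini() are the libzpool userspace boot entry points and their exact signatures are assumed here, and the 2/2 sizing follows the advice in the dctl_server_init() comment:

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/dmu_ctl.h>

    int
    main(void)
    {
            int error;

            kernel_init();          /* assumed libzpool boot call */

            /* min_thr = 2, max_free_thr = 2: see dctl_server_init() comment */
            error = dctl_server_init(DMU_CTL_DEFAULT_DIR, 2, 2);
            if (error) {
                    fprintf(stderr, "dctl_server_init: %d\n", error);
                    kernel_fini();
                    return (1);
            }

            pause();                /* serve lzfs/lzpool clients until signaled */

            error = dctl_server_fini();
            if (error == EBUSY)
                    fprintf(stderr, "objsets still open\n");
            kernel_fini();
            return (error != 0);
    }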
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static dctl_thr_info_t thr_pool = { + .dti_mtx = PTHREAD_MUTEX_INITIALIZER +}; + +/* + * Create n threads. + * Callers must acquire thr_pool.dti_mtx first. + */ +static int dctl_thr_create(int n) +{ + dctl_thr_info_t *p = &thr_pool; + int error; + + for (int i = 0; i < n; i++) { + wthr_info_t *thr = malloc(sizeof(wthr_info_t)); + if (thr == NULL) + return ENOMEM; + + thr->wthr_exit = B_FALSE; + thr->wthr_free = B_TRUE; + + error = pthread_create(&thr->wthr_id, NULL, p->dti_thr_func, + thr); + if (error) { + free(thr); + return error; + } + + p->dti_free++; + + list_insert_tail(&p->dti_list, thr); + } + return 0; +} + +/* + * Mark the thread as dead. + * Must be called right before exiting the main thread function. + */ +void dctl_thr_die(wthr_info_t *thr) +{ + dctl_thr_info_t *p = &thr_pool; + + thr->wthr_exit = B_TRUE; + dctl_thr_rebalance(thr, B_FALSE); + + pthread_mutex_lock(&p->dti_mtx); + + list_remove(&p->dti_list, thr); + list_insert_tail(&p->dti_join_list, thr); + + pthread_mutex_unlock(&p->dti_mtx); +} + +/* + * Clean-up dead threads. + */ +void dctl_thr_join() +{ + dctl_thr_info_t *p = &thr_pool; + wthr_info_t *thr; + + pthread_mutex_lock(&p->dti_mtx); + + while ((thr = list_head(&p->dti_join_list))) { + list_remove(&p->dti_join_list, thr); + + ASSERT(!pthread_equal(thr->wthr_id, pthread_self())); + + /* + * This should not block because all the threads + * on this list should have died already. + * + * pthread_join() can only return an error if + * we made a programming mistake. + */ + VERIFY(pthread_join(thr->wthr_id, NULL) == 0); + + ASSERT(thr->wthr_exit); + ASSERT(!thr->wthr_free); + + free(thr); + } + + pthread_mutex_unlock(&p->dti_mtx); +} + +/* + * Adjust the number of free threads in the pool and the thread status. + * + * Callers must acquire thr_pool.dti_mtx first. + */ +static void dctl_thr_adjust_free(wthr_info_t *thr, boolean_t set_free) +{ + dctl_thr_info_t *p = &thr_pool; + + ASSERT(p->dti_free >= 0); + + if (!thr->wthr_free && set_free) + p->dti_free++; + else if (thr->wthr_free && !set_free) + p->dti_free--; + + ASSERT(p->dti_free >= 0); + + thr->wthr_free = set_free; +} + +/* + * Rebalance threads. Also adjusts the free status of the thread. + * Will set the thread exit flag if the number of free threads is above + * the limit. + */ +void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free) +{ + dctl_thr_info_t *p = &thr_pool; + + pthread_mutex_lock(&p->dti_mtx); + + if (p->dti_exit || p->dti_free > p->dti_max_free) + thr->wthr_exit = B_TRUE; + + if (thr->wthr_exit) + set_free = B_FALSE; + + dctl_thr_adjust_free(thr, set_free); + + if (!p->dti_exit && p->dti_free == 0) + dctl_thr_create(1); + + pthread_mutex_unlock(&p->dti_mtx); +} + +/* + * Stop the thread pool. + * + * This can take a while since it actually waits for all threads to exit. 
+ */ +void dctl_thr_pool_stop() +{ + dctl_thr_info_t *p = &thr_pool; + wthr_info_t *thr; + struct timespec ts; + + pthread_mutex_lock(&p->dti_mtx); + + ASSERT(!p->dti_exit); + p->dti_exit = B_TRUE; + + /* Let's flag the threads first */ + thr = list_head(&p->dti_list); + while (thr != NULL) { + thr->wthr_exit = B_TRUE; + dctl_thr_adjust_free(thr, B_FALSE); + + thr = list_next(&p->dti_list, thr); + } + + pthread_mutex_unlock(&p->dti_mtx); + + /* Now let's wait for them to exit */ + ts.tv_sec = 0; + ts.tv_nsec = 50000000; /* 50ms */ + do { + nanosleep(&ts, NULL); + + pthread_mutex_lock(&p->dti_mtx); + thr = list_head(&p->dti_list); + pthread_mutex_unlock(&p->dti_mtx); + + dctl_thr_join(); + } while(thr != NULL); + + ASSERT(p->dti_free == 0); + + ASSERT(list_is_empty(&p->dti_list)); + ASSERT(list_is_empty(&p->dti_join_list)); + + list_destroy(&p->dti_list); + list_destroy(&p->dti_join_list); +} + +/* + * Create thread pool. + * + * If at least one thread creation fails, it will stop all previous + * threads and return a non-zero value. + */ +int dctl_thr_pool_create(int min_thr, int max_free_thr, + thr_func_t *thr_func) +{ + int error; + dctl_thr_info_t *p = &thr_pool; + + ASSERT(p->dti_free == 0); + + /* Initialize global variables */ + p->dti_min = min_thr; + p->dti_max_free = max_free_thr; + p->dti_exit = B_FALSE; + p->dti_thr_func = thr_func; + + list_create(&p->dti_list, sizeof(wthr_info_t), offsetof(wthr_info_t, + wthr_node)); + list_create(&p->dti_join_list, sizeof(wthr_info_t), + offsetof(wthr_info_t, wthr_node)); + + pthread_mutex_lock(&p->dti_mtx); + error = dctl_thr_create(min_thr); + pthread_mutex_unlock(&p->dti_mtx); + + if (error) + dctl_thr_pool_stop(); + + return error; +} diff --git a/zfs/lib/libdmu-ctl/dmu_send.c b/zfs/lib/libdmu-ctl/dmu_send.c new file mode 100644 index 0000000000..1c72f95073 --- /dev/null +++ b/zfs/lib/libdmu-ctl/dmu_send.c @@ -0,0 +1,1249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
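A short trace makes the rebalancing rules above concrete. Start with min_thr = 2 and max_free_thr = 2: both workers sit in poll() and dti_free = 2. A connection arrives; the accepting worker calls dctl_thr_rebalance(thr, B_FALSE), dropping dti_free to 1, and goes off to handle the request. A second connection drops dti_free to 0, at which point rebalance calls dctl_thr_create(1) so there is always a listener available. As handlers finish, rebalance(thr, B_TRUE) raises dti_free again; once it would exceed max_free_thr, the surplus worker has wthr_exit set, calls dctl_thr_die(), and moves onto dti_join_list, where the next dctl_thr_join() reaps it.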
+ */ + +#pragma ident "@(#)dmu_send.c 1.14 08/04/27 SMI" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static char *dmu_recv_tag = "dmu_recv_tag"; + +struct backuparg { + dmu_replay_record_t *drr; + vnode_t *vp; + offset_t *off; + objset_t *os; + zio_cksum_t zc; + int err; +}; + +static int +dump_bytes(struct backuparg *ba, void *buf, int len) +{ + ssize_t resid; /* have to get resid to get detailed errno */ + ASSERT3U(len % 8, ==, 0); + + fletcher_4_incremental_native(buf, len, &ba->zc); + ba->err = vn_rdwr(UIO_WRITE, ba->vp, + (caddr_t)buf, len, + 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); + *ba->off += len; + return (ba->err); +} + +static int +dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, + uint64_t length) +{ + /* write a FREE record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREE; + ba->drr->drr_u.drr_free.drr_object = object; + ba->drr->drr_u.drr_free.drr_offset = offset; + ba->drr->drr_u.drr_free.drr_length = length; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_data(struct backuparg *ba, dmu_object_type_t type, + uint64_t object, uint64_t offset, int blksz, void *data) +{ + /* write a DATA record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_WRITE; + ba->drr->drr_u.drr_write.drr_object = object; + ba->drr->drr_u.drr_write.drr_type = type; + ba->drr->drr_u.drr_write.drr_offset = offset; + ba->drr->drr_u.drr_write.drr_length = blksz; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + if (dump_bytes(ba, data, blksz)) + return (EINTR); + return (0); +} + +static int +dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) +{ + /* write a FREEOBJECTS record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_FREEOBJECTS; + ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; + ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + return (0); +} + +static int +dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) +{ + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) + return (dump_freeobjects(ba, object, 1)); + + /* write an OBJECT record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_OBJECT; + ba->drr->drr_u.drr_object.drr_object = object; + ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; + ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; + ba->drr->drr_u.drr_object.drr_blksz = + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; + ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; + ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) + return (EINTR); + + /* free anything past the end of the file */ + if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + return (EINTR); + if (ba->err) + return (EINTR); + return (0); +} + +#define BP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +static int +backup_cb(traverse_blk_cache_t *bc, spa_t 
*spa, void *arg) +{ + struct backuparg *ba = arg; + uint64_t object = bc->bc_bookmark.zb_object; + int level = bc->bc_bookmark.zb_level; + uint64_t blkid = bc->bc_bookmark.zb_blkid; + blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL; + dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; + void *data = bc->bc_data; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + ASSERT(data || bp == NULL); + + if (bp == NULL && object == 0) { + uint64_t span = BP_SPAN(bc->bc_dnode, level); + uint64_t dnobj = (blkid * span) >> DNODE_SHIFT; + err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); + } else if (bp == NULL) { + uint64_t span = BP_SPAN(bc->bc_dnode, level); + err = dump_free(ba, object, blkid * span, span); + } else if (data && level == 0 && type == DMU_OT_DNODE) { + dnode_phys_t *blk = data; + int i; + int blksz = BP_GET_LSIZE(bp); + + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = + (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = dump_dnode(ba, dnobj, blk+i); + if (err) + break; + } + } else if (level == 0 && + type != DMU_OT_DNODE && type != DMU_OT_OBJSET) { + int blksz = BP_GET_LSIZE(bp); + if (data == NULL) { + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + zbookmark_t zb; + + zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; + zb.zb_object = object; + zb.zb_level = level; + zb.zb_blkid = blkid; + (void) arc_read(NULL, spa, bp, + dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, + &aflags, &zb); + + if (abuf) { + err = dump_data(ba, type, object, blkid * blksz, + blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); + } + } else { + err = dump_data(ba, type, object, blkid * blksz, + blksz, data); + } + } + + ASSERT(err == 0 || err == EINTR); + return (err); +} + +int +dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, + vnode_t *vp, offset_t *off) +{ + dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os->os_dsl_dataset : NULL; + dmu_replay_record_t *drr; + struct backuparg ba; + int err; + uint64_t fromtxg = 0; + + /* tosnap must be a snapshot */ + if (ds->ds_phys->ds_next_snap_obj == 0) + return (EINVAL); + + /* fromsnap must be an earlier snapshot from the same fs as tosnap */ + if (fromds && (ds->ds_dir != fromds->ds_dir || + fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) + return (EXDEV); + + if (fromorigin) { + if (fromsnap) + return (EINVAL); + + if (ds->ds_dir->dd_phys->dd_origin_obj != NULL) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_open_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, NULL, + DS_MODE_NONE, FTAG, &fromds); + rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); + } else { + fromorigin = B_FALSE; + } + } + + + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; + drr->drr_u.drr_begin.drr_creation_time = + ds->ds_phys->ds_creation_time; + drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + if (fromorigin) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; + drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + + if (fromds) + drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; + dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); + + if (fromds) + fromtxg = fromds->ds_phys->ds_creation_txg; + if (fromorigin) + dsl_dataset_close(fromds, DS_MODE_NONE, FTAG); + + ba.drr = drr; + ba.vp = vp; + ba.os = tosnap; + ba.off = off; + ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + err = traverse_dsl_dataset(ds, fromtxg, + ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, + backup_cb, &ba); + + if (err) { + if (err == EINTR && ba.err) + err = ba.err; + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (err); + } + + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = ba.zc; + + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + return (ba.err); + } + + kmem_free(drr, sizeof (dmu_replay_record_t)); + + return (0); +} + +struct recvbeginsyncarg { + const char *tofs; + const char *tosnap; + dsl_dataset_t *origin; + uint64_t fromguid; + dmu_objset_type_t type; + void *tag; + boolean_t force; + uint64_t dsflags; + char clonelastname[MAXNAMELEN]; + dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ +}; + +static dsl_dataset_t * +recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, + cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + + VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL, + DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds)); + + if (type != DMU_OST_NONE) { + (void) dmu_objset_create_impl(dp->dp_spa, + ds, &ds->ds_phys->ds_bp, type, tx); + } + + spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", + ds->ds_phys->ds_dir_obj); + + return (ds); +} + +/* ARGSUSED */ +static int +recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t val; + 
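Two details of the send path above are worth working through. First, BP_SPAN(): for a dnode with 128 KB data blocks (dn_datablkszsec = 256 sectors of 512 bytes) and 16 KB indirect blocks (dn_indblkshift = 14, so each indirect block holds 2^(14-7) = 128 block pointers), the logical span covered by one block pointer at each level is

    level 0: 256 << (9 + 0*(14-7)) = 256 <<  9 = 128 KB   (one data block)
    level 1: 256 << (9 + 1*(14-7)) = 256 << 16 =  16 MB   (128 data blocks)
    level 2: 256 << (9 + 2*(14-7)) = 256 << 23 =   2 GB

which is how a single hole blkptr lets backup_cb() emit one DRR_FREE (or DRR_FREEOBJECTS) record for an arbitrarily large region. Second, the stream emitted by dmu_sendbackup() is strictly sequential; a small full send looks roughly like this (objects and sizes illustrative):

    DRR_BEGIN       {magic, version, toguid, toname, ...}
    DRR_OBJECT      {object=1, type, blksz, bonuslen}  + bonus payload
    DRR_WRITE       {object=1, offset=0, length=128K}  + 128 KB payload
    DRR_FREE        {object=1, offset=128K, length=-1}
    DRR_FREEOBJECTS {firstobj=2, numobjs=...}
    ...
    DRR_END         {checksum}   <- fletcher-4 of every byte before this record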
int err; + + err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, + strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); + + if (err != ENOENT) + return (err ? err : EEXIST); + + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + + return (0); +} + +static void +recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dir_t *dd = arg1; + struct recvbeginsyncarg *rbsa = arg2; + uint64_t dsobj; + uint64_t flags = DS_FLAG_INCONSISTENT; + + flags |= rbsa->dsflags; + + dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, + rbsa->origin, flags, cr, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); +} + +static int +recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + + /* must be a head ds */ + if (ds->ds_phys->ds_next_snap_obj != 0) + return (EINVAL); + + /* must not be a clone ds */ + if (ds->ds_prev != NULL) + return (EINVAL); + + err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); + if (err) + return (err); + + if (rbsa->origin) { + /* make sure it's a snap in the same pool */ + if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) + return (EXDEV); + if (rbsa->origin->ds_phys->ds_num_children == 0) + return (EINVAL); + if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + } + + return (0); +} + +static void +recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_dir_t *dd = ds->ds_dir; + uint64_t dsobj; + uint64_t flags = DS_FLAG_INCONSISTENT; + + flags |= rbsa->dsflags; + + /* + * NB: caller must provide an extra hold on the dsl_dir_t, so it + * won't go away when dsl_dataset_destroy_sync() closes the + * dataset. + */ + dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); + + dsobj = dsl_dataset_create_sync_impl(dd, rbsa->origin, flags, tx); + + rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, + rbsa->origin ? 
DMU_OST_NONE : rbsa->type, cr, tx); +} + +/* ARGSUSED */ +static int +recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + int err; + uint64_t val; + + /* must not have any changes since most recent snapshot */ + if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + return (ETXTBSY); + + /* must already be a snapshot of this fs */ + if (ds->ds_phys->ds_prev_snap_obj == 0) + return (ENODEV); + + /* most recent snapshot must match fromguid */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) + return (ENODEV); + + /* temporary clone name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, + rbsa->clonelastname, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + + /* new snapshot name must not exist */ + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); + if (err == 0) + return (EEXIST); + if (err != ENOENT) + return (err); + return (0); +} + +/* ARGSUSED */ +static void +recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ohds = arg1; + struct recvbeginsyncarg *rbsa = arg2; + dsl_pool_t *dp = ohds->ds_dir->dd_pool; + dsl_dataset_t *ods, *cds; + uint64_t dsobj; + uint64_t flags = DS_FLAG_INCONSISTENT; + + flags |= rbsa->dsflags; + + /* create the temporary clone */ + VERIFY(0 == dsl_dataset_open_obj(dp, ohds->ds_phys->ds_prev_snap_obj, + NULL, DS_MODE_STANDARD, FTAG, &ods)); + dsobj = dsl_dataset_create_sync(ohds->ds_dir, + rbsa->clonelastname, ods, flags, cr, tx); + dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG); + + /* open the temporary clone */ + VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL, + DS_MODE_EXCLUSIVE, dmu_recv_tag, &cds)); + + /* copy the refquota from the target fs to the clone */ + if (ohds->ds_quota > 0) + dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); + + rbsa->ds = cds; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + dp->dp_spa, tx, cr, "dataset = %lld", + cds->ds_phys->ds_dir_obj); +} + +/* ARGSUSED */ +static void +recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", + ds->ds_phys->ds_dir_obj); +} + +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. + */ +int +dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) +{ + int err = 0; + boolean_t byteswap; + struct recvbeginsyncarg rbsa; + uint64_t version; + int flags; + dsl_dataset_t *ds; + + if (drrb->drr_magic == DMU_BACKUP_MAGIC) + byteswap = FALSE; + else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + byteswap = TRUE; + else + return (EINVAL); + + rbsa.tofs = tofs; + rbsa.tosnap = tosnap; + rbsa.origin = origin ? 
origin->os->os_dsl_dataset : NULL; + rbsa.fromguid = drrb->drr_fromguid; + rbsa.type = drrb->drr_type; + rbsa.tag = FTAG; + rbsa.dsflags = 0; + version = drrb->drr_version; + flags = drrb->drr_flags; + + if (byteswap) { + rbsa.type = BSWAP_32(rbsa.type); + rbsa.fromguid = BSWAP_64(rbsa.fromguid); + version = BSWAP_64(version); + flags = BSWAP_32(flags); + } + + if (version != DMU_BACKUP_STREAM_VERSION || + rbsa.type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && origin == NULL)) + return (EINVAL); + + if (flags & DRR_FLAG_CI_DATA) + rbsa.dsflags = DS_FLAG_CI_DATASET; + + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drrb = drrb; + drc->drc_tosnap = tosnap; + drc->drc_force = force; + + /* + * Process the begin in syncing context. + */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { + /* offline incremental receive */ + err = dsl_dataset_open(tofs, + DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds); + if (err) + return (err); + + /* + * Only do the rollback if the most recent snapshot + * matches the incremental source + */ + if (force) { + if (ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_guid != + rbsa.fromguid) { + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, + dmu_recv_tag); + return (ENODEV); + } + (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + } + rbsa.force = B_FALSE; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_offline_incremental_sync, + ds, &rbsa, 1); + if (err) { + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = drc->drc_real_ds = ds; + } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { + /* online incremental receive */ + + /* tmp clone name is: tofs/%tosnap" */ + (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), + "%%%s", tosnap); + + /* open the dataset we are logically receiving into */ + err = dsl_dataset_open(tofs, + DS_MODE_STANDARD, dmu_recv_tag, &ds); + if (err) + return (err); + + rbsa.force = force; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_incremental_check, + recv_online_incremental_sync, ds, &rbsa, 5); + if (err) { + dsl_dataset_close(ds, DS_MODE_STANDARD, dmu_recv_tag); + return (err); + } + drc->drc_logical_ds = ds; + drc->drc_real_ds = rbsa.ds; + } else { + /* create new fs -- full backup or clone */ + dsl_dir_t *dd = NULL; + const char *tail; + + err = dsl_dir_open(tofs, FTAG, &dd, &tail); + if (err) + return (err); + if (tail == NULL) { + if (!force) { + dsl_dir_close(dd, FTAG); + return (EEXIST); + } + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, + DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, + FTAG, &ds); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (err) { + dsl_dir_close(dd, FTAG); + return (err); + } + + err = dsl_sync_task_do(dd->dd_pool, + recv_full_existing_check, + recv_full_existing_sync, ds, &rbsa, 5); + /* if successful, sync task closes the ds for us */ + if (err) + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + } else { + err = dsl_sync_task_do(dd->dd_pool, recv_full_check, + recv_full_sync, dd, &rbsa, 5); + if (err) + return (err); + } + dsl_dir_close(dd, FTAG); + if (err) + return (err); + drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; + drc->drc_newfs = B_TRUE; + } + + /* downgrade our hold on the ds from EXCLUSIVE to PRIMARY */ + dsl_dataset_downgrade(drc->drc_real_ds, + DS_MODE_EXCLUSIVE, DS_MODE_PRIMARY); + + return (0); +} + +struct restorearg { + int err; + int byteswap; + vnode_t *vp; + char *buf; + 
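dmu_recv_begin() above boils down to three setups. (1) Offline incremental (fromguid set, no clone flag, !online): open tofs exclusively, optionally roll back to the snapshot matching fromguid, and mark the dataset DS_FLAG_INCONSISTENT in syncing context. (2) Online incremental: create a temporary clone of the most recent snapshot, named tofs/%tosnap, and receive into that; dmu_recv_end() later clone-swaps it into place, so the live filesystem stays usable during the receive. (3) Full backup or clone: create the dataset, or with force destroy and recreate an existing head. In all three cases drc_real_ds carries DS_FLAG_INCONSISTENT until dmu_recv_end() clears it, so an interrupted receive is detectable.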
uint64_t voff; + int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t cksum; +}; + +static void * +restore_read(struct restorearg *ra, int len) +{ + void *rv; + int done = 0; + + /* some things will require 8-byte alignment, so everything must */ + ASSERT3U(len % 8, ==, 0); + + while (done < len) { + ssize_t resid; + + ra->err = vn_rdwr(UIO_READ, ra->vp, + (caddr_t)ra->buf + done, len - done, + ra->voff, UIO_SYSSPACE, FAPPEND, + RLIM64_INFINITY, CRED(), &resid); + + if (resid == len - done) + ra->err = EINVAL; + ra->voff += len - done - resid; + done = len - resid; + if (ra->err) + return (NULL); + } + + ASSERT3U(done, ==, len); + rv = ra->buf; + if (ra->byteswap) + fletcher_4_incremental_byteswap(rv, len, &ra->cksum); + else + fletcher_4_incremental_native(rv, len, &ra->cksum); + return (rv); +} + +static void +backup_byteswap(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_version); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + /* DO64(drr_object.drr_allocation_txg); */ + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_length); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + break; + case DRR_END: + DO64(drr_end.drr_checksum.zc_word[0]); + DO64(drr_end.drr_checksum.zc_word[1]); + DO64(drr_end.drr_checksum.zc_word[2]); + DO64(drr_end.drr_checksum.zc_word[3]); + break; + } +#undef DO64 +#undef DO32 +} + +static int +restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) +{ + int err; + dmu_tx_t *tx; + + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + + if (drro->drr_type == DMU_OT_NONE || + drro->drr_type >= DMU_OT_NUMTYPES || + drro->drr_bonustype >= DMU_OT_NUMTYPES || + drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > SPA_MAXBLOCKSIZE || + drro->drr_bonuslen > DN_MAX_BONUSLEN) { + return (EINVAL); + } + + tx = dmu_tx_create(os); + + if (err == ENOENT) { + /* currently free, want to be allocated */ + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_claim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } else { + /* currently allocated, want to be allocated */ + dmu_tx_hold_bonus(tx, drro->drr_object); + /* + * We may change blocksize, so need to + * hold_write + */ + dmu_tx_hold_write(tx, drro->drr_object, 0, 1); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + err 
= dmu_object_reclaim(os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } + if (err) { + dmu_tx_commit(tx); + return (EINVAL); + } + + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); + dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); + + if (drro->drr_bonuslen) { + dmu_buf_t *db; + void *data; + VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); + if (data == NULL) { + dmu_tx_commit(tx); + return (ra->err); + } + bcopy(data, db->db_data, drro->drr_bonuslen); + if (ra->byteswap) { + dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + drro->drr_bonuslen); + } + dmu_buf_rele(db, FTAG); + } + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_freeobjects(struct restorearg *ra, objset_t *os, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (EINVAL); + + for (obj = drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs; + (void) dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + int err; + + if (dmu_object_info(os, obj, NULL) != 0) + continue; + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_object_free(os, obj, tx); + dmu_tx_commit(tx); + if (err && err != ENOENT) + return (EINVAL); + } + return (0); +} + +static int +restore_write(struct restorearg *ra, objset_t *os, + struct drr_write *drrw) +{ + dmu_tx_t *tx; + void *data; + int err; + + if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + drrw->drr_type >= DMU_OT_NUMTYPES) + return (EINVAL); + + data = restore_read(ra, drrw->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrw->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + if (ra->byteswap) + dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); + dmu_write(os, drrw->drr_object, + drrw->drr_offset, drrw->drr_length, data, tx); + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +restore_free(struct restorearg *ra, objset_t *os, + struct drr_free *drrf) +{ + dmu_tx_t *tx; + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (EINVAL); + + if (dmu_object_info(os, drrf->drr_object, NULL) != 0) + return (EINVAL); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + err = dmu_free_range(os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length, tx); + dmu_tx_commit(tx); + return (err); +} + +void +dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) +{ + if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { + /* + * online incremental or new fs: destroy the fs (which + * may be a clone) that we created + */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (drc->drc_real_ds != drc->drc_logical_ds) { + dsl_dataset_close(drc->drc_logical_ds, + DS_MODE_STANDARD, dmu_recv_tag); + } + } else { + /* + * offline incremental: 
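Each restore_*() helper repeats the same DMU transaction idiom, which is worth isolating since every record type lands through it: declare holds, assign with TXG_WAIT (sleep until a transaction group has room rather than fail), modify, commit — and abort, never commit, a tx whose assignment failed. A stripped-down sketch, assuming an open objset os and illustrative object/offset/size values:

    dmu_tx_t *tx = dmu_tx_create(os);

    dmu_tx_hold_write(tx, object, offset, size);    /* declare intent */

    error = dmu_tx_assign(tx, TXG_WAIT);            /* may sleep for a txg */
    if (error) {
            dmu_tx_abort(tx);                       /* unassigned tx: abort */
            return (error);
    }

    dmu_write(os, object, offset, size, data, tx);  /* the actual change */
    dmu_tx_commit(tx);                              /* lands in this txg */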
rollback to most recent snapshot. + */ + int lmode = DS_MODE_PRIMARY; + if (dsl_dataset_tryupgrade(drc->drc_real_ds, + DS_MODE_PRIMARY, DS_MODE_EXCLUSIVE)) { + lmode = DS_MODE_EXCLUSIVE; + (void) dsl_dataset_rollback(drc->drc_real_ds, + DMU_OST_NONE); + } + dsl_dataset_close(drc->drc_real_ds, lmode, FTAG); + } +} + +/* + * NB: callers *must* call dmu_recv_end() if this succeeds. + */ +int +dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) +{ + struct restorearg ra = { 0 }; + dmu_replay_record_t *drr; + objset_t *os; + zio_cksum_t pcksum; + + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + ra.byteswap = TRUE; + + { + /* compute checksum of drr_begin record */ + dmu_replay_record_t *drr; + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &ra.cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); + } + + if (ra.byteswap) { + struct drr_begin *drrb = drc->drc_drrb; + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } + + ra.vp = vp; + ra.voff = *voffp; + ra.bufsize = 1<<20; + ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + + /* these were verified in dmu_recv_begin */ + ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); + + /* + * Open the objset we are modifying. + */ + VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); + + ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + + /* + * Read records and process them. + */ + pcksum = ra.cksum; + while (ra.err == 0 && + NULL != (drr = restore_read(&ra, sizeof (*drr)))) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + ra.err = EINTR; + goto out; + } + + if (ra.byteswap) + backup_byteswap(drr); + + switch (drr->drr_type) { + case DRR_OBJECT: + { + /* + * We need to make a copy of the record header, + * because restore_{object,write} may need to + * restore_read(), which will invalidate drr. + */ + struct drr_object drro = drr->drr_u.drr_object; + ra.err = restore_object(&ra, os, &drro); + break; + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects drrfo = + drr->drr_u.drr_freeobjects; + ra.err = restore_freeobjects(&ra, os, &drrfo); + break; + } + case DRR_WRITE: + { + struct drr_write drrw = drr->drr_u.drr_write; + ra.err = restore_write(&ra, os, &drrw); + break; + } + case DRR_FREE: + { + struct drr_free drrf = drr->drr_u.drr_free; + ra.err = restore_free(&ra, os, &drrf); + break; + } + case DRR_END: + { + struct drr_end drre = drr->drr_u.drr_end; + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. + */ + if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) + ra.err = ECKSUM; + goto out; + } + default: + ra.err = EINVAL; + goto out; + } + pcksum = ra.cksum; + } + ASSERT(ra.err != 0); + +out: + dmu_objset_close(os); + + if (ra.err != 0) { + /* + * rollback or destroy what we created, so we don't + * leave it in the restoring state. 
+ */ + txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); + dmu_recv_abort_cleanup(drc); + } + + kmem_free(ra.buf, ra.bufsize); + *voffp = ra.voff; + return (ra.err); +} + +struct recvendsyncarg { + char *tosnap; + uint64_t creation_time; + uint64_t toguid; +}; + +static int +recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); +} + +static void +recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct recvendsyncarg *resa = arg2; + + dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; + ds->ds_prev->ds_phys->ds_guid = resa->toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + int err = 0; + int lmode; + + /* + * XXX hack; seems the ds is still dirty and + * dsl_pool_zil_clean() expects it to have a ds_user_ptr (and + * zil), but clone_swap() can close it. + */ + txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); + + if (dsl_dataset_tryupgrade(drc->drc_real_ds, + DS_MODE_PRIMARY, DS_MODE_EXCLUSIVE)) { + lmode = DS_MODE_EXCLUSIVE; + } else { + dmu_recv_abort_cleanup(drc); + return (EBUSY); + } + + if (drc->drc_logical_ds != drc->drc_real_ds) { + if (err == 0 && dsl_dataset_tryupgrade(drc->drc_logical_ds, + DS_MODE_STANDARD, DS_MODE_EXCLUSIVE)) { + lmode = DS_MODE_EXCLUSIVE; + err = dsl_dataset_clone_swap(drc->drc_real_ds, + drc->drc_logical_ds, drc->drc_force); + } else { + lmode = DS_MODE_STANDARD; + err = EBUSY; + } + } + + if (err == 0) { + struct recvendsyncarg resa; + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(drc->drc_real_ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, + drc->drc_logical_ds, &resa, 3); + if (err) { + if (drc->drc_newfs) { + ASSERT(drc->drc_logical_ds == drc->drc_real_ds); + (void) dsl_dataset_destroy(drc->drc_real_ds, + dmu_recv_tag); + return (err); + } else { + (void) dsl_dataset_rollback(drc->drc_logical_ds, + DMU_OST_NONE); + } + } + } + + if (drc->drc_logical_ds != drc->drc_real_ds) { + /* dsl_dataset_destroy() will close the ds */ + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + } + /* close the hold from dmu_recv_begin */ + dsl_dataset_close(drc->drc_logical_ds, lmode, dmu_recv_tag); + return (err); +} diff --git a/zfs/lib/libdmu-ctl/include/Makefile.in b/zfs/lib/libdmu-ctl/include/Makefile.in new file mode 100644 index 0000000000..6611e4143d --- /dev/null +++ b/zfs/lib/libdmu-ctl/include/Makefile.in @@ -0,0 +1 @@ +subdir-m += sys diff --git a/zfs/lib/libdmu-ctl/include/sys/Makefile.in b/zfs/lib/libdmu-ctl/include/sys/Makefile.in new file mode 100644 index 0000000000..8f97ae684e --- /dev/null +++ b/zfs/lib/libdmu-ctl/include/sys/Makefile.in @@ -0,0 +1 @@ +DISTFILES = dmu_ctl.h dmu_ctl_impl.h diff --git a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h new file mode 100644 index 0000000000..c2044ba273 --- /dev/null +++ b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and 
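Pulling the receive contract together from the NB comments: dmu_recv_begin() must always be followed by dmu_recv_stream() (otherwise the dataset holds leak), dmu_recv_stream() invokes dmu_recv_abort_cleanup() itself on failure, and dmu_recv_end() runs only after a successful stream — it snapshots the result and clears DS_FLAG_INCONSISTENT. Note also the checksum trick in the stream loop: DRR_END is compared against the checksum accumulated *before* that record was read, because the sender's checksum covers everything up to, but not including, the DRR_END record itself. The caller ordering, as a sketch:

    err = dmu_recv_begin(tofs, tosnap, drrb, force, origin, online, &drc);
    if (err)
            return (err);                   /* nothing held yet */

    err = dmu_recv_stream(&drc, vp, &voff); /* MUST follow a good begin */
    if (err)
            return (err);                   /* stream cleaned up after itself */

    return (dmu_recv_end(&drc));            /* snapshot + clear inconsistent */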
Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DMU_CTL_H +#define _SYS_DMU_CTL_H + +#include + +/* Default directory where the clients search for sockets to connect */ +#define DMU_CTL_DEFAULT_DIR "/var/run/zfs/udmu" + +/* + * These functions are called by the server process. + * + * kernel_init() must be called before dctl_server_init(). + * kernel_fini() must not be called before dctl_server_fini(). + * + * All objsets must be closed and object references be released before calling + * dctl_server_fini(), otherwise it will return EBUSY. + * + * Note: On Solaris, it is highly recommended to either catch or ignore the + * SIGPIPE signal, otherwise the server process will die if the client is + * killed. + */ +int dctl_server_init(const char *cfg_dir, int min_threads, + int max_free_threads); +int dctl_server_fini(); + +/* + * The following functions are called by the DMU from the server process context + * (in the worker threads). + */ +int dctls_copyin(const void *src, void *dest, size_t size); +int dctls_copyinstr(const char *from, char *to, size_t max, + size_t *len); +int dctls_copyout(const void *src, void *dest, size_t size); +int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp); +int dctls_fd_write(int fd, const void *src, ssize_t len); + +/* + * These functions are called by the client process (libzfs). + */ +int dctlc_connect(const char *dir, boolean_t check_subdirs); +void dctlc_disconnect(int fd); + +int dctlc_ioctl(int fd, int32_t request, void *arg); + +#endif diff --git a/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h new file mode 100644 index 0000000000..6b4a564b31 --- /dev/null +++ b/zfs/lib/libdmu-ctl/include/sys/dmu_ctl_impl.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
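The SIGPIPE warning above is easy to act on; one line in the server before dctl_server_init() suffices (sketch):

    (void) signal(SIGPIPE, SIG_IGN);    /* needs <signal.h> */

On Linux the socket writes already pass MSG_NOSIGNAL in dctl_send_data(), so this mainly protects the Solaris build, where a client dying mid-reply would otherwise kill the whole server process.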
+ */ + +#ifndef _SYS_DMU_CTL_IMPL_H +#define _SYS_DMU_CTL_IMPL_H + +#include +#include +#include +#include +#include + +#define SOCKNAME "dmu_socket" + +#define DCTL_PROTOCOL_VER 1 +#define DCTL_MAGIC 0xdc71b1070c01dc71ll + +/* Message types */ +enum { + DCTL_IOCTL, + DCTL_IOCTL_REPLY, + DCTL_COPYIN, + DCTL_COPYINSTR, + DCTL_COPYOUT, + DCTL_FD_READ, + DCTL_FD_WRITE, + DCTL_GEN_REPLY /* generic reply */ +}; + +/* On-the-wire message */ +typedef struct dctl_cmd { + uint64_t dcmd_magic; + int8_t dcmd_version; + int8_t dcmd_msg; + uint8_t dcmd_pad[6]; + union { + struct dcmd_ioctl { + uint64_t arg; + int32_t cmd; + uint8_t pad[4]; + } dcmd_ioctl; + + struct dcmd_copy_req { + uint64_t ptr; + uint64_t size; + } dcmd_copy; + + struct dcmd_fd_req { + int64_t size; + int32_t fd; + uint8_t pad[4]; + } dcmd_fd_io; + + struct dcmd_reply { + uint64_t size; /* used by reply to DCTL_COPYINSTR, + DCTL_FD_READ and DCTL_FD_WRITE */ + int32_t rc; /* return code */ + uint8_t pad[4]; + } dcmd_reply; + } u; +} dctl_cmd_t; + +#define DCTL_CMD_HEADER_SIZE (sizeof(uint64_t) + sizeof(uint8_t)) + +/* + * The following definitions are only used by the server code. + */ + +#define LISTEN_BACKLOG 5 + +/* Worker thread data */ +typedef struct wthr_info { + list_node_t wthr_node; + pthread_t wthr_id; + boolean_t wthr_exit; /* termination flag */ + boolean_t wthr_free; +} wthr_info_t; + +/* Control socket data */ +typedef struct dctl_sock_info { + pthread_mutex_t dsi_mtx; + char *dsi_path; + struct sockaddr_un dsi_addr; + int dsi_fd; +} dctl_sock_info_t; + +typedef void *thr_func_t(void *); + +/* Thread pool data */ +typedef struct dctl_thr_info { + thr_func_t *dti_thr_func; + + pthread_mutex_t dti_mtx; /* protects the thread lists and dti_free */ + list_t dti_list; /* list of threads in the thread pool */ + list_t dti_join_list; /* list of threads that are waiting to be + joined */ + int dti_free; /* number of free worker threads */ + + int dti_min; + int dti_max_free; + + boolean_t dti_exit; /* global termination flag */ +} dctl_thr_info_t; + +/* Messaging functions */ +int dctl_read_msg(int fd, dctl_cmd_t *cmd); +int dctl_send_msg(int fd, dctl_cmd_t *cmd); + +int dctl_read_data(int fd, void *ptr, size_t size); +int dctl_send_data(int fd, const void *ptr, size_t size); + +/* Thread pool functions */ +int dctl_thr_pool_create(int min_thr, int max_free_thr, + thr_func_t *thr_func); +void dctl_thr_pool_stop(); + +void dctl_thr_join(); +void dctl_thr_die(wthr_info_t *thr); +void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free); + +#endif diff --git a/zfs/lib/libdmu-ctl/rrwlock.c b/zfs/lib/libdmu-ctl/rrwlock.c new file mode 100644 index 0000000000..c46ed8155a --- /dev/null +++ b/zfs/lib/libdmu-ctl/rrwlock.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)rrwlock.c 1.1 07/10/24 SMI"
+
+#include
+#include
+
+/*
+ * This file contains the implementation of a re-entrant read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
+ * tsd list can represent a different rrwlock_t. This allows a thread
+ * to enter multiple and unique rrwlock_ts for read locks at the same time.
+ *
+ * Since using tsd exposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting. If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed. Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if it is a re-entrant lock. But since it may be one,
+ * we allow the read to proceed (otherwise it could deadlock). Since once
+ * waiting writers are active, readers no longer bump the anonymous count,
+ * the anonymous readers will eventually flush themselves out. At this point,
+ * readers will be able to tell if they are a re-entrant lock (have a
+ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
+ * we must let them proceed. If they are not, then the reader blocks for the
+ * waiting writers. Hence, we do not starve writers.
+ */
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+typedef struct rrw_node {
+    struct rrw_node *rn_next;
+    rrwlock_t *rn_rrl;
+} rrw_node_t;
+
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+    rrw_node_t *rn;
+
+    if (refcount_count(&rrl->rr_linked_rcount) == 0)
+        return (NULL);
+
+    for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+        if (rn->rn_rrl == rrl)
+            return (rn);
+    }
+    return (NULL);
+}
+
+/*
+ * Add a node to the head of the singly linked list.
+ */
+static void
+rrn_add(rrwlock_t *rrl)
+{
+    rrw_node_t *rn;
+
+    rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+    rn->rn_rrl = rrl;
+    rn->rn_next = tsd_get(rrw_tsd_key);
+    VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list and return TRUE; otherwise return FALSE.
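+ *
+ * Note: each linked rrw_enter_read() adds exactly one rrw_node_t via
+ * rrn_add(), so removing just the first match here balances one enter
+ * with one exit even when the lock has been re-entered.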
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl)
+{
+    rrw_node_t *rn;
+    rrw_node_t *prev = NULL;
+
+    if (refcount_count(&rrl->rr_linked_rcount) == 0)
+        return (B_FALSE);
+
+    for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+        if (rn->rn_rrl == rrl) {
+            if (prev)
+                prev->rn_next = rn->rn_next;
+            else
+                VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+            kmem_free(rn, sizeof (*rn));
+            return (B_TRUE);
+        }
+        prev = rn;
+    }
+    return (B_FALSE);
+}
+
+void
+rrw_init(rrwlock_t *rrl)
+{
+    mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+    cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+    rrl->rr_writer = NULL;
+    refcount_create(&rrl->rr_anon_rcount);
+    refcount_create(&rrl->rr_linked_rcount);
+    rrl->rr_writer_wanted = B_FALSE;
+}
+
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+    mutex_destroy(&rrl->rr_lock);
+    cv_destroy(&rrl->rr_cv);
+    ASSERT(rrl->rr_writer == NULL);
+    refcount_destroy(&rrl->rr_anon_rcount);
+    refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+static void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+    mutex_enter(&rrl->rr_lock);
+    ASSERT(rrl->rr_writer != curthread);
+    ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+    while (rrl->rr_writer || (rrl->rr_writer_wanted &&
+        refcount_is_zero(&rrl->rr_anon_rcount) &&
+        rrn_find(rrl) == NULL))
+        cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+    if (rrl->rr_writer_wanted) {
+        /* may or may not be a re-entrant enter */
+        rrn_add(rrl);
+        (void) refcount_add(&rrl->rr_linked_rcount, tag);
+    } else {
+        (void) refcount_add(&rrl->rr_anon_rcount, tag);
+    }
+    ASSERT(rrl->rr_writer == NULL);
+    mutex_exit(&rrl->rr_lock);
+}
+
+static void
+rrw_enter_write(rrwlock_t *rrl)
+{
+    mutex_enter(&rrl->rr_lock);
+    ASSERT(rrl->rr_writer != curthread);
+
+    while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
+        refcount_count(&rrl->rr_linked_rcount) > 0 ||
+        rrl->rr_writer != NULL) {
+        rrl->rr_writer_wanted = B_TRUE;
+        cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+    }
+    rrl->rr_writer_wanted = B_FALSE;
+    rrl->rr_writer = curthread;
+    mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+    if (rw == RW_READER)
+        rrw_enter_read(rrl, tag);
+    else
+        rrw_enter_write(rrl);
+}
+
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+    mutex_enter(&rrl->rr_lock);
+    ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
+        !refcount_is_zero(&rrl->rr_linked_rcount) ||
+        rrl->rr_writer != NULL);
+
+    if (rrl->rr_writer == NULL) {
+        if (rrn_find_and_remove(rrl)) {
+            if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
+                cv_broadcast(&rrl->rr_cv);
+        } else {
+            if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
+                cv_broadcast(&rrl->rr_cv);
+        }
+    } else {
+        ASSERT(rrl->rr_writer == curthread);
+        ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
+            refcount_is_zero(&rrl->rr_linked_rcount));
+        rrl->rr_writer = NULL;
+        cv_broadcast(&rrl->rr_cv);
+    }
+    mutex_exit(&rrl->rr_lock);
+}
+
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+    boolean_t held;
+
+    mutex_enter(&rrl->rr_lock);
+    if (rw == RW_WRITER) {
+        held = (rrl->rr_writer == curthread);
+    } else {
+        held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
+            !refcount_is_zero(&rrl->rr_linked_rcount));
+    }
+    mutex_exit(&rrl->rr_lock);
+
+    return (held);
+}
diff --git a/zfs/lib/libdmu-ctl/zfs_acl.c b/zfs/lib/libdmu-ctl/zfs_acl.c
new file mode 100644
index 0000000000..cc2f97e1b7
--- /dev/null
+++ b/zfs/lib/libdmu-ctl/zfs_acl.c
@@ -0,0 +1,2641 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "@(#)zfs_acl.c 1.25 08/04/08 SMI" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fs/fs_subr.h" +#include + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ + ACE_WRITE_OWNER) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask 
= mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == (ACE_GROUP | ACE_IDENTIFIER_GROUP) || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + 
zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +static zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = zfs_acl_fuid_ops; + else + aclp->z_ops = zfs_acl_v0_ops; + return (aclp); +} + +static zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while (aclnode = list_head(&aclp->z_acl)) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + switch (iflags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + case ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + case 0: /* User entry */ + break; + default: + return (B_FALSE); + + } + + /* + * next check inheritance level flags + */ + + if (type != ALLOW && type > MAX_ACE_TYPE) { + return (B_FALSE); + } + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * Only directories should have inheritance flags. 
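+	 * An ACE that carries any inheritance flag on a non-directory
+	 * object is therefore rejected as invalid.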
+ */ + if (obj_type != VDIR && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) { + return (B_FALSE); + } + + if (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + *iflags = aclp->z_ops.ace_flags_get(acep); + *type = aclp->z_ops.ace_type_get(acep); + *access_mask = aclp->z_ops.ace_mask_get(acep); + *who = aclp->z_ops.ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + + aclp->z_ops.ace_size(acep); + aclnode->z_ace_idx++; + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. 
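+ *
+ * Returns 0 and sets *size on success, or EINVAL as soon as an ACE
+ * fails the zfs_ace_valid() check.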
+ */ +int +zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, + zfs_ace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + if (!aclp->z_has_fuids) + aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); + aceptr->z_fuid = (uint64_t)acep->a_who; + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (EINVAL); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops.ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while (zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type)) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != (ACE_GROUP | ACE_IDENTIFIER_GROUP) && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
+            ZFS_ACE_GROUP : ZFS_ACE_USER);
+        } else {
+            acep->a_who = (uid_t)(int64_t)who;
+        }
+        acep->a_access_mask = access_mask;
+        acep->a_flags = iflags;
+        acep->a_type = type;
+        acep = (ace_t *)((caddr_t)acep + ace_size);
+    }
+}
+
+static int
+zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
+    zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+    int i;
+    zfs_oldace_t *aceptr = z_acl;
+
+    for (i = 0; i != aclcnt; i++, aceptr++) {
+        aceptr->z_access_mask = acep[i].a_access_mask;
+        aceptr->z_type = acep[i].a_type;
+        aceptr->z_flags = acep[i].a_flags;
+        aceptr->z_fuid = acep[i].a_who;
+        /*
+         * Make sure ACE is valid
+         */
+        if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
+            aceptr->z_flags) != B_TRUE)
+            return (EINVAL);
+    }
+    *size = (caddr_t)aceptr - (caddr_t)z_acl;
+    return (0);
+}
+
+/*
+ * Convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp)
+{
+    zfs_oldace_t *oldaclp;
+    int i;
+    uint16_t type, iflags;
+    uint32_t access_mask;
+    uint64_t who;
+    void *cookie = NULL;
+    zfs_acl_node_t *newaclnode;
+
+    ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+    /*
+     * First create the ACE in a contiguous piece of memory
+     * for zfs_copy_ace_2_fuid().
+     *
+     * We only convert an ACL once, so this won't happen
+     * every time.
+     */
+    oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+        KM_SLEEP);
+    i = 0;
+    while (cookie = zfs_acl_next_ace(aclp, cookie, &who,
+        &access_mask, &iflags, &type)) {
+        oldaclp[i].z_flags = iflags;
+        oldaclp[i].z_type = type;
+        oldaclp[i].z_fuid = who;
+        oldaclp[i++].z_access_mask = access_mask;
+    }
+
+    newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+        sizeof (zfs_object_ace_t));
+    aclp->z_ops = zfs_acl_fuid_ops;
+    VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp,
+        newaclnode->z_acldata, aclp->z_acl_count,
+        &newaclnode->z_size) == 0);
+    newaclnode->z_ace_count = aclp->z_acl_count;
+    aclp->z_version = ZFS_ACL_VERSION;
+    kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+    /*
+     * Release all previous ACL nodes
+     */
+
+    zfs_acl_release_nodes(aclp);
+
+    list_insert_head(&aclp->z_acl, newaclnode);
+
+    aclp->z_acl_bytes = newaclnode->z_size;
+    aclp->z_acl_count = newaclnode->z_ace_count;
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+    uint32_t new_mask = 0;
+
+    if (access_mask & S_IXOTH)
+        new_mask |= ACE_EXECUTE;
+    if (access_mask & S_IWOTH)
+        new_mask |= ACE_WRITE_DATA;
+    if (access_mask & S_IROTH)
+        new_mask |= ACE_READ_DATA;
+    return (new_mask);
+}
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+    uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+    uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+    aclp->z_ops.ace_mask_set(acep, access_mask);
+    aclp->z_ops.ace_type_set(acep, access_type);
+    aclp->z_ops.ace_flags_set(acep, entry_type);
+    if ((type != ACE_OWNER && type != (ACE_GROUP | ACE_IDENTIFIER_GROUP) &&
+        type != ACE_EVERYONE))
+        aclp->z_ops.ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
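+ * Bits are decided by the first owner@/group@/everyone@ ACE that
+ * names them (tracked in 'seen'); later entries cannot flip a bit
+ * already decided.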
+ * Also, create FUIDs for any User/Group ACEs + */ +static uint64_t +zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, + zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + + mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while (acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type)) { + + /* + * Skip over inherit only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + if (entry_type == ACE_OWNER) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } + /* + * Now handle FUID create for user/group ACEs + */ + if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { + aclp->z_ops.ace_who_set(acep, + zfs_fuid_create(zp->z_zfsvfs, who, cr, + (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, + tx, fuidp)); + } + } + return (mode); +} + +static zfs_acl_t * +zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + + aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); + + /* + * Version 0 to 1 znode_acl_phys has the size/count fields swapped. + * Version 0 didn't have a size field, only a count. + */ + if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; + aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); + } else { + aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; + aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; + } + + aclnode = zfs_acl_node_alloc(will_modify ? 
+        aclp->z_acl_bytes : 0);
+    aclnode->z_ace_count = aclp->z_acl_count;
+    if (will_modify) {
+        bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata,
+            aclp->z_acl_bytes);
+    } else {
+        aclnode->z_size = aclp->z_acl_bytes;
+        aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0];
+    }
+
+    list_insert_head(&aclp->z_acl, aclnode);
+
+    return (aclp);
+}
+
+/*
+ * Read an external ACL object.
+ */
+static int
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
+{
+    uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
+    zfs_acl_t *aclp;
+    size_t aclsize;
+    size_t acl_count;
+    zfs_acl_node_t *aclnode;
+    int error;
+
+    ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+    if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
+        *aclpp = zfs_acl_node_read_internal(zp, will_modify);
+        return (0);
+    }
+
+    aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
+    if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+        zfs_acl_phys_v0_t *zacl0 =
+            (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl;
+
+        aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count);
+        acl_count = zacl0->z_acl_count;
+    } else {
+        aclsize = zp->z_phys->zp_acl.z_acl_size;
+        acl_count = zp->z_phys->zp_acl.z_acl_count;
+        if (aclsize == 0)
+            aclsize = acl_count * sizeof (zfs_ace_t);
+    }
+    aclnode = zfs_acl_node_alloc(aclsize);
+    list_insert_head(&aclp->z_acl, aclnode);
+    error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+        aclsize, aclnode->z_acldata);
+    aclnode->z_ace_count = acl_count;
+    aclp->z_acl_count = acl_count;
+    aclp->z_acl_bytes = aclsize;
+
+    if (error != 0) {
+        zfs_acl_free(aclp);
+        return (error);
+    }
+
+    *aclpp = aclp;
+    return (0);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
+    zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
+{
+    int error;
+    znode_phys_t *zphys = zp->z_phys;
+    zfs_acl_phys_t *zacl = &zphys->zp_acl;
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+    uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
+    uint64_t off = 0;
+    dmu_object_type_t otype;
+    zfs_acl_node_t *aclnode;
+
+    ASSERT(MUTEX_HELD(&zp->z_lock));
+    ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+    dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+    zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx);
+
+    /*
+     * Decide which object type to use. If we are forced to
+     * use the old ACL format then transform the ACL into the
+     * zfs_oldace_t layout.
+     */
+    if (!zfsvfs->z_use_fuids) {
+        otype = DMU_OT_OLDACL;
+    } else {
+        if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+            (zfsvfs->z_version >= ZPL_VERSION_FUID))
+            zfs_acl_xform(zp, aclp);
+        ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+        otype = DMU_OT_ACL;
+    }
+
+    if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+        /*
+         * If ACL was previously external and we are now
+         * converting to new ACL format then release old
+         * ACL object and create a new one.
+         */
+        if (aoid && aclp->z_version != zacl->z_acl_version) {
+            error = dmu_object_free(zfsvfs->z_os,
+                zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+            if (error)
+                return (error);
+            aoid = 0;
+        }
+        if (aoid == 0) {
+            aoid = dmu_object_alloc(zfsvfs->z_os,
+                otype, aclp->z_acl_bytes,
+                otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE,
+                otype == DMU_OT_ACL ?
DN_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, + aclp->z_acl_bytes, 0, tx); + } + zphys->zp_acl.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = zacl->z_ace_data; + /* + * Migrating back embedded? + */ + if (zphys->zp_acl.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + zp->z_phys->zp_acl.z_acl_extern_obj, tx); + if (error) + return (error); + zphys->zp_acl.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + zphys->zp_acl.z_acl_size = aclp->z_acl_count; + zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; + } else { + zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; + zphys->zp_acl.z_acl_count = aclp->z_acl_count; + } + + zphys->zp_acl.z_acl_version = aclp->z_version; + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_phys->zp_flags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; + + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + return (0); +} + +/* + * Update access mask for prepended ACE + * + * This applies the "groupmask" value for aclmode property. + */ +static void +zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, + mode_t mode, uint64_t owner) +{ + int rmask, wmask, xmask; + int user_ace; + uint16_t aceflags; + uint32_t origmask, acepmask; + uint64_t fuid; + + aceflags = aclp->z_ops.ace_flags_get(acep); + fuid = aclp->z_ops.ace_who_get(acep); + origmask = aclp->z_ops.ace_mask_get(origacep); + acepmask = aclp->z_ops.ace_mask_get(acep); + + user_ace = (!(aceflags & + (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); + + if (user_ace && (fuid == owner)) { + rmask = S_IRUSR; + wmask = S_IWUSR; + xmask = S_IXUSR; + } else { + rmask = S_IRGRP; + wmask = S_IWGRP; + xmask = S_IXGRP; + } + + if (origmask & ACE_READ_DATA) { + if (mode & rmask) { + acepmask &= ~ACE_READ_DATA; + } else { + acepmask |= ACE_READ_DATA; + } + } + + if (origmask & ACE_WRITE_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_WRITE_DATA; + } else { + acepmask |= ACE_WRITE_DATA; + } + } + + if (origmask & ACE_APPEND_DATA) { + if (mode & wmask) { + acepmask &= ~ACE_APPEND_DATA; + } else { + acepmask |= ACE_APPEND_DATA; + } + } + + if (origmask & ACE_EXECUTE) { + if (mode & xmask) { + acepmask &= ~ACE_EXECUTE; + } else { + acepmask |= ACE_EXECUTE; + } + } + aclp->z_ops.ace_mask_set(acep, acepmask); +} + +/* + * Apply mode to canonical six ACEs. 
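+ *
+ * The deny/allow pairs for owner@, group@ and everyone@ sit at the
+ * tail of the last ACL node; each pair is adjusted to mirror the
+ * rwx bits of the new mode.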
+ */ +static void +zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) +{ + zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); + void *acep; + int maskoff = aclp->z_ops.ace_mask_off(); + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + + ASSERT(aclnode != NULL); + + acep = (void *)((caddr_t)aclnode->z_acldata + + aclnode->z_size - (abstract_size * 6)); + + /* + * Fixup final ACEs to match the mode + */ + + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0700) >> 6); /* owner@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + + adjust_ace_pair_common(acep, maskoff, abstract_size, + (mode & 0070) >> 3); /* group@ */ + + acep = (caddr_t)acep + (abstract_size * 2); + adjust_ace_pair_common(acep, maskoff, + abstract_size, mode); /* everyone@ */ +} + + +static int +zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny, + int entry_type, int accessmask) +{ + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + uint16_t type = aclp->z_ops.ace_type_get(acep); + uint16_t flags = aclp->z_ops.ace_flags_get(acep); + + return (mask == accessmask && type == allow_deny && + ((flags & ACE_TYPE_FLAGS) == entry_type)); +} + +/* + * Can prepended ACE be reused? + */ +static int +zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep) +{ + int okay_masks; + uint16_t prevtype; + uint16_t prevflags; + uint16_t flags; + uint32_t mask, prevmask; + + if (prevacep == NULL) + return (B_FALSE); + + prevtype = aclp->z_ops.ace_type_get(prevacep); + prevflags = aclp->z_ops.ace_flags_get(prevacep); + flags = aclp->z_ops.ace_flags_get(acep); + mask = aclp->z_ops.ace_mask_get(acep); + prevmask = aclp->z_ops.ace_mask_get(prevacep); + + if (prevtype != DENY) + return (B_FALSE); + + if (prevflags != (flags & ACE_IDENTIFIER_GROUP)) + return (B_FALSE); + + okay_masks = (mask & OKAY_MASK_BITS); + + if (prevmask & ~okay_masks) + return (B_FALSE); + + return (B_TRUE); +} + + +/* + * Insert new ACL node into chain of zfs_acl_node_t's + * + * This will result in two possible results. + * 1. If the ACL is currently just a single zfs_acl_node and + * we are prepending the entry then current acl node will have + * a new node inserted above it. + * + * 2. If we are inserting in the middle of current acl node then + * the current node will be split in two and new node will be inserted + * in between the two split nodes. 
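+ *
+ * In the split case the trailer node aliases the original node's
+ * buffer (it is allocated with zero backing bytes), so freeing it
+ * never releases the original node's data.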
+ */
+static zfs_acl_node_t *
+zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep)
+{
+    zfs_acl_node_t *newnode;
+    zfs_acl_node_t *trailernode = NULL;
+    zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp);
+    int curr_idx = aclp->z_curr_node->z_ace_idx;
+    int trailer_count;
+    size_t oldsize;
+
+    newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep));
+    newnode->z_ace_count = 1;
+
+    oldsize = currnode->z_size;
+
+    if (curr_idx != 1) {
+        trailernode = zfs_acl_node_alloc(0);
+        trailernode->z_acldata = acep;
+
+        trailer_count = currnode->z_ace_count - curr_idx + 1;
+        currnode->z_ace_count = curr_idx - 1;
+        currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata;
+        trailernode->z_size = oldsize - currnode->z_size;
+        trailernode->z_ace_count = trailer_count;
+    }
+
+    aclp->z_acl_count += 1;
+    aclp->z_acl_bytes += aclp->z_ops.ace_size(acep);
+
+    if (curr_idx == 1)
+        list_insert_before(&aclp->z_acl, currnode, newnode);
+    else
+        list_insert_after(&aclp->z_acl, currnode, newnode);
+    if (trailernode) {
+        list_insert_after(&aclp->z_acl, newnode, trailernode);
+        aclp->z_curr_node = trailernode;
+        trailernode->z_ace_idx = 1;
+    }
+
+    return (newnode);
+}
+
+/*
+ * Prepend deny ACE
+ */
+static void *
+zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep,
+    mode_t mode)
+{
+    zfs_acl_node_t *aclnode;
+    void *newacep;
+    uint64_t fuid;
+    uint16_t flags;
+
+    aclnode = zfs_acl_ace_insert(aclp, acep);
+    newacep = aclnode->z_acldata;
+    fuid = aclp->z_ops.ace_who_get(acep);
+    flags = aclp->z_ops.ace_flags_get(acep);
+    zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
+    zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid);
+
+    return (newacep);
+}
+
+/*
+ * Split an inherited ACE into inherit_only ACE
+ * and original ACE with inheritance flags stripped off.
+ */
+static void
+zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep)
+{
+    zfs_acl_node_t *aclnode;
+    zfs_acl_node_t *currnode;
+    void *newacep;
+    uint16_t type, flags;
+    uint32_t mask;
+    uint64_t fuid;
+
+    type = aclp->z_ops.ace_type_get(acep);
+    flags = aclp->z_ops.ace_flags_get(acep);
+    mask = aclp->z_ops.ace_mask_get(acep);
+    fuid = aclp->z_ops.ace_who_get(acep);
+
+    aclnode = zfs_acl_ace_insert(aclp, acep);
+    newacep = aclnode->z_acldata;
+
+    aclp->z_ops.ace_type_set(newacep, type);
+    aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE);
+    aclp->z_ops.ace_mask_set(newacep, mask);
+    aclp->z_ops.ace_who_set(newacep, fuid);
+    aclp->z_next_ace = acep;
+    flags &= ~ALL_INHERIT;
+    aclp->z_ops.ace_flags_set(acep, flags);
+    currnode = zfs_acl_curr_node(aclp);
+    ASSERT(currnode->z_ace_idx >= 1);
+    currnode->z_ace_idx -= 1;
+}
+
+/*
+ * Are the last six ACEs the canonical six ACEs?
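+ *
+ * The canonical six are, in order: owner@ deny/allow, group@
+ * deny/allow and everyone@ deny/allow, matching what zfs_acl_chmod()
+ * appends below when they are missing.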
+ */
+static int
+zfs_have_canonical_six(zfs_acl_t *aclp)
+{
+    void *acep;
+    zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
+    int i = 0;
+    size_t abstract_size = aclp->z_ops.ace_abstract_size();
+
+    ASSERT(aclnode != NULL);
+
+    if (aclnode->z_ace_count < 6)
+        return (0);
+
+    acep = (void *)((caddr_t)aclnode->z_acldata +
+        aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6));
+
+    if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
+        DENY, ACE_OWNER, 0) &&
+        zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
+        ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) &&
+        zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY,
+        OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep +
+        (abstract_size * i++),
+        ALLOW, OWNING_GROUP, 0) &&
+        zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
+        DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
+        zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
+        ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) {
+        return (1);
+    } else {
+        return (0);
+    }
+}
+
+/*
+ * Apply step 1g to group entries
+ *
+ * Need to deal with corner case where group may have
+ * greater permissions than owner. If so then limit
+ * group permissions, based on what extra permissions
+ * group has.
+ */
+static void
+zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
+    mode_t mode)
+{
+    uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep);
+    uint32_t mask = aclp->z_ops.ace_mask_get(acep);
+    uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep);
+    mode_t extramode = (mode >> 3) & 07;
+    mode_t ownermode = (mode >> 6);
+
+    if (prevflags & ACE_IDENTIFIER_GROUP) {
+
+        extramode &= ~ownermode;
+
+        if (extramode) {
+            if (extramode & S_IROTH) {
+                prevmask &= ~ACE_READ_DATA;
+                mask &= ~ACE_READ_DATA;
+            }
+            if (extramode & S_IWOTH) {
+                prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+                mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+            }
+            if (extramode & S_IXOTH) {
+                prevmask &= ~ACE_EXECUTE;
+                mask &= ~ACE_EXECUTE;
+            }
+        }
+    }
+    aclp->z_ops.ace_mask_set(acep, mask);
+    aclp->z_ops.ace_mask_set(prevacep, prevmask);
+}
+
+/*
+ * Apply the chmod algorithm as described
+ * in PSARC/2002/240
+ */
+static void
+zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
+{
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+    void *acep = NULL, *prevacep = NULL;
+    uint64_t who;
+    int i;
+    int entry_type;
+    int reuse_deny;
+    int need_canonical_six = 1;
+    uint16_t iflags, type;
+    uint32_t access_mask;
+
+    ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+    ASSERT(MUTEX_HELD(&zp->z_lock));
+
+    aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
+
+    /*
+     * If discard then just discard all ACL nodes which
+     * represent the ACEs.
+     *
+     * New owner@/group@/everyone@ ACEs will be added
+     * later.
+     */
+    if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
+        zfs_acl_release_nodes(aclp);
+
+    while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+        &iflags, &type)) {
+
+        entry_type = (iflags & ACE_TYPE_FLAGS);
+        iflags = (iflags & ALL_INHERIT);
+
+        if ((type != ALLOW && type != DENY) ||
+            (iflags & ACE_INHERIT_ONLY_ACE)) {
+            if (iflags)
+                aclp->z_hints |= ZFS_INHERIT_ACE;
+            switch (type) {
+            case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+            case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+            case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+            case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+                aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+                break;
+            }
+            goto nextace;
+        }
+
+        /*
+         * Need to split ace into two?
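+         * An effective ACE that also carries file/dir inheritance
+         * bits becomes an inherit_only copy plus the original with
+         * the inheritance bits stripped.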
+ */ + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) && + (!(iflags & ACE_INHERIT_ONLY_ACE))) { + zfs_acl_split_ace(aclp, acep); + aclp->z_hints |= ZFS_INHERIT_ACE; + goto nextace; + } + + if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + (entry_type == OWNING_GROUP)) { + access_mask &= ~OGE_CLEAR; + aclp->z_ops.ace_mask_set(acep, access_mask); + goto nextace; + } else { + reuse_deny = B_TRUE; + if (type == ALLOW) { + + /* + * Check preceding ACE if any, to see + * if we need to prepend a DENY ACE. + * This is only applicable when the acl_mode + * property == groupmask. + */ + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { + + reuse_deny = zfs_reuse_deny(aclp, acep, + prevacep); + + if (!reuse_deny) { + prevacep = + zfs_acl_prepend_deny(zp, + aclp, acep, mode); + } else { + zfs_acl_prepend_fixup( + aclp, prevacep, + acep, mode, + zp->z_phys->zp_uid); + } + zfs_fixup_group_entries(aclp, acep, + prevacep, mode); + + } + } + } +nextace: + prevacep = acep; + } + + /* + * Check out last six aces, if we have six. + */ + + if (aclp->z_acl_count >= 6) { + if (zfs_have_canonical_six(aclp)) { + need_canonical_six = 0; + } + } + + if (need_canonical_six) { + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; + zfs_acl_node_t *aclnode = + zfs_acl_node_alloc(abstract_size * 6); + + aclnode->z_size = abstract_size * 6; + aclnode->z_ace_count = 6; + aclp->z_acl_bytes += aclnode->z_size; + list_insert_tail(&aclp->z_acl, aclnode); + + zacep = aclnode->z_acldata; + + i = 0; + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + 0, DENY, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + DENY, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, + ALLOW, -1, OWNING_GROUP); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE); + zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), + EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE); + aclp->z_acl_count += 6; + } + + zfs_acl_fixup_canonical_six(aclp, mode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error; + + mutex_enter(&zp->z_lock); + mutex_enter(&zp->z_acl_lock); + *aclp = NULL; + error = zfs_acl_node_read(zp, aclp, B_TRUE); + if (error == 0) + zfs_acl_chmod(zp, mode, *aclp); + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + return (error); +} + +/* + * strip off write_owner and write_acl + */ +static void +zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) +{ + uint32_t mask = aclp->z_ops.ace_mask_get(acep); + + if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && + (aclp->z_ops.ace_type_get(acep) == ALLOW)) { + mask &= ~RESTRICTED_CLEAR; + aclp->z_ops.ace_mask_set(acep, mask); + } +} + +/* + * Should ACE be inherited? 
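+ *
+ * Directories take DIRECTORY_INHERIT ACEs, and FILE_INHERIT-only
+ * ACEs as well (to pass on to their files) unless NO_PROPAGATE is
+ * set; non-directories take FILE_INHERIT ACEs.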
+ */ +static int +zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +{ + int vtype = ZTOV(zp)->v_type; + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, boolean_t *need_chmod) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + void *pacep; + void *acep, *acep2; + zfs_acl_node_t *aclnode, *aclnode2; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + enum vtype vntype = ZTOV(zp)->v_type; + + *need_chmod = B_TRUE; + pacep = NULL; + aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) { + while (pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type)) { + + if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW && + type == ALLOW) + continue; + + ace_size = aclp->z_ops.ace_size(pacep); + + if (!zfs_ace_can_use(zp, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. + */ + if (zfsvfs->z_acl_inherit == + ZFS_ACL_PASSTHROUGH && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == + OWNING_GROUP)) && (vntype == VREG || + (vntype == VDIR && + (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops.ace_data(pacep, + &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops.ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops.ace_flags_get(acep); + + if (vntype == VDIR) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || + (vntype != VDIR)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + zfs_restricted_update(zfsvfs, aclp, acep); + continue; + } + + ASSERT(vntype == VDIR); + + newflags = aclp->z_ops.ace_flags_get(acep); + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) != + ACE_FILE_INHERIT_ACE) { + aclnode2 = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode2); + acep2 = aclnode2->z_acldata; + zfs_set_ace(aclp, acep2, + access_mask, type, who, + iflags|ACE_INHERITED_ACE); + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, newflags); + newflags &= ~ALL_INHERIT; + aclp->z_ops.ace_flags_set(acep2, + newflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = aclp->z_ops.ace_data(acep, + &data1)) != 0) { + VERIFY((data2sz = + aclp->z_ops.ace_data(acep2, + &data2)) == data1sz); + bcopy(data1, data2, data1sz); + } + aclp->z_acl_count++; + aclnode2->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + zfs_restricted_update(zfsvfs, aclp, acep2); + } else { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops.ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + } + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. 
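+ *
+ * Chooses the new object's uid/gid and applies the set-GID directory
+ * rules, then builds the ACL from the caller-supplied ACL, the
+ * parent's inheritable ACEs, or a fresh empty ACL before
+ * zfs_aclset_common() writes it out.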
+ */
+void
+zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
+    vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+    zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp)
+{
+    uint64_t mode, fuid, fgid;
+    int error;
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+    zfs_acl_t *aclp = NULL;
+    zfs_acl_t *paclp;
+    xvattr_t *xvap = (xvattr_t *)vap;
+    gid_t gid;
+    boolean_t need_chmod = B_TRUE;
+
+    if (setaclp)
+        aclp = setaclp;
+
+    mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+    /*
+     * Determine uid and gid.
+     */
+    if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+        ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+        fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr,
+            ZFS_OWNER, tx, fuidp);
+        fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
+            ZFS_GROUP, tx, fuidp);
+        gid = vap->va_gid;
+    } else {
+        fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp);
+        fgid = 0;
+        if (vap->va_mask & AT_GID) {
+            fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
+                ZFS_GROUP, tx, fuidp);
+            gid = vap->va_gid;
+            if (fgid != parent->z_phys->zp_gid &&
+                !groupmember(vap->va_gid, cr) &&
+                secpolicy_vnode_create_gid(cr) != 0)
+                fgid = 0;
+        }
+        if (fgid == 0) {
+            if (parent->z_phys->zp_mode & S_ISGID) {
+                fgid = parent->z_phys->zp_gid;
+                gid = zfs_fuid_map_id(zfsvfs, fgid,
+                    cr, ZFS_GROUP);
+            } else {
+                fgid = zfs_fuid_create_cred(zfsvfs,
+                    ZFS_GROUP, tx, cr, fuidp);
+                gid = crgetgid(cr);
+            }
+        }
+    }
+
+    /*
+     * If we're creating a directory, and the parent directory has the
+     * set-GID bit set, set it on the new directory.
+     * Otherwise, if the user is neither privileged nor a member of the
+     * file's new group, clear the file's set-GID bit.
+     */
+
+    if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) {
+        mode |= S_ISGID;
+    } else {
+        if ((mode & S_ISGID) &&
+            secpolicy_vnode_setids_setgids(cr, gid) != 0)
+            mode &= ~S_ISGID;
+    }
+
+    zp->z_phys->zp_uid = fuid;
+    zp->z_phys->zp_gid = fgid;
+    zp->z_phys->zp_mode = mode;
+
+    if (aclp == NULL) {
+        mutex_enter(&parent->z_lock);
+        if (parent->z_phys->zp_flags & ZFS_INHERIT_ACE) {
+            mutex_enter(&parent->z_acl_lock);
+            VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE));
+            mutex_exit(&parent->z_acl_lock);
+            aclp = zfs_acl_inherit(zp, paclp, &need_chmod);
+            zfs_acl_free(paclp);
+        } else {
+            aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+        }
+        mutex_exit(&parent->z_lock);
+        mutex_enter(&zp->z_lock);
+        mutex_enter(&zp->z_acl_lock);
+        if (need_chmod)
+            zfs_acl_chmod(zp, mode, aclp);
+    } else {
+        mutex_enter(&zp->z_lock);
+        mutex_enter(&zp->z_acl_lock);
+    }
+
+    /* Force auto_inherit on all new directory objects */
+    if (vap->va_type == VDIR)
+        aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+
+    error = zfs_aclset_common(zp, aclp, cr, fuidp, tx);
+
+    /* Set optional attributes if any */
+    if (vap->va_mask & AT_XVATTR)
+        zfs_xvattr_set(zp, xvap);
+
+    mutex_exit(&zp->z_lock);
+    mutex_exit(&zp->z_acl_lock);
+    ASSERT3U(error, ==, 0);
+
+    if (aclp != setaclp)
+        zfs_acl_free(aclp);
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+    zfs_acl_t *aclp;
+    ulong_t mask;
+    int error;
+    int count = 0;
+    int largeace = 0;
+
+    mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+        VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+    if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
+        return (error);
+
+    if (mask == 0)
+        return (ENOSYS);
+
+    mutex_enter(&zp->z_acl_lock);
+
+    error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+    if (error != 0) {
+        mutex_exit(&zp->z_acl_lock);
+        return (error);
+    }
+
+    /*
+     * Scan ACL to determine number of ACEs
+     */
+    if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) &&
+        !(mask & VSA_ACE_ALLTYPES)) {
+        void *zacep = NULL;
+        uint64_t who;
+        uint32_t access_mask;
+        uint16_t type, iflags;
+
+        while (zacep = zfs_acl_next_ace(aclp, zacep,
+            &who, &access_mask, &iflags, &type)) {
+            switch (type) {
+            case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+            case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+            case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+            case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+                largeace++;
+                continue;
+            default:
+                count++;
+            }
+        }
+        vsecp->vsa_aclcnt = count;
+    } else
+        count = aclp->z_acl_count;
+
+    if (mask & VSA_ACECNT) {
+        vsecp->vsa_aclcnt = count;
+    }
+
+    if (mask & VSA_ACE) {
+        size_t aclsz;
+
+        zfs_acl_node_t *aclnode = list_head(&aclp->z_acl);
+
+        aclsz = count * sizeof (ace_t) +
+            sizeof (ace_object_t) * largeace;
+
+        vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+        vsecp->vsa_aclentsz = aclsz;
+
+        if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+            zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
+                vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+        else {
+            bcopy(aclnode->z_acldata, vsecp->vsa_aclentp,
+                count * sizeof (ace_t));
+        }
+    }
+    if (mask & VSA_ACE_ACLFLAGS) {
+        vsecp->vsa_aclflags = 0;
+        if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED)
+            vsecp->vsa_aclflags |= ACL_DEFAULTED;
+        if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED)
+            vsecp->vsa_aclflags |= ACL_PROTECTED;
+        if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT)
+            vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+    }
+
+    mutex_exit(&zp->z_acl_lock);
+
+    zfs_acl_free(aclp);
+
+    return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
+    vsecattr_t *vsecp, zfs_acl_t **zaclp)
+{
+    zfs_acl_t *aclp;
+    zfs_acl_node_t *aclnode;
+    int aclcnt = vsecp->vsa_aclcnt;
+    int error;
+
+    if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+        return (EINVAL);
+
+    aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+    aclp->z_hints = 0;
+    aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+    if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+        if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
+            (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+            aclcnt, &aclnode->z_size)) != 0) {
+            zfs_acl_free(aclp);
+            zfs_acl_node_free(aclnode);
+            return (error);
+        }
+    } else {
+        if ((error = zfs_copy_ace_2_fuid(obj_type, aclp,
+            vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+            &aclnode->z_size)) != 0) {
+            zfs_acl_free(aclp);
+            zfs_acl_node_free(aclnode);
+            return (error);
+        }
+    }
+    aclp->z_acl_bytes = aclnode->z_size;
+    aclnode->z_ace_count = aclcnt;
+    aclp->z_acl_count = aclcnt;
+    list_insert_head(&aclp->z_acl, aclnode);
+
+    /*
+     * If flags are being set then add them to z_hints
+     */
+    if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+        if (vsecp->vsa_aclflags & ACL_PROTECTED)
+            aclp->z_hints |= ZFS_ACL_PROTECTED;
+        if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+            aclp->z_hints |= ZFS_ACL_DEFAULTED;
+        if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+            aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+    }
+
+    *zaclp = aclp;
+
+    return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+    zilog_t *zilog = zfsvfs->z_log;
+    ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+    dmu_tx_t *tx;
+    int error;
+    zfs_acl_t *aclp;
+    zfs_fuid_info_t *fuidp = NULL;
+
+    if (mask == 0)
+        return (ENOSYS);
+
+    if (zp->z_phys->zp_flags & ZFS_IMMUTABLE)
+        return (EPERM);
+
+    if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
+        return (error);
+
+    error = zfs_vsec_2_aclp(zfsvfs,
+    if (error)
+        return (error);
+
+    /*
+     * If ACL wide flags aren't being set then preserve any
+     * existing flags.
+     */
+    if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+        aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
+    }
+top:
+    if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) {
+        zfs_acl_free(aclp);
+        return (error);
+    }
+
+    mutex_enter(&zp->z_lock);
+    mutex_enter(&zp->z_acl_lock);
+
+    tx = dmu_tx_create(zfsvfs->z_os);
+    dmu_tx_hold_bonus(tx, zp->z_id);
+
+    if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+        /* Are we upgrading ACL? */
+        if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
+            zp->z_phys->zp_acl.z_acl_version ==
+            ZFS_ACL_VERSION_INITIAL) {
+            dmu_tx_hold_free(tx,
+                zp->z_phys->zp_acl.z_acl_extern_obj,
+                0, DMU_OBJECT_END);
+            dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+                0, aclp->z_acl_bytes);
+        } else {
+            dmu_tx_hold_write(tx,
+                zp->z_phys->zp_acl.z_acl_extern_obj,
+                0, aclp->z_acl_bytes);
+        }
+    } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+        dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+    }
+    if (aclp->z_has_fuids) {
+        if (zfsvfs->z_fuid_obj == 0) {
+            dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+            dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+                FUID_SIZE_ESTIMATE(zfsvfs));
+            dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+        } else {
+            dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+            dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+                FUID_SIZE_ESTIMATE(zfsvfs));
+        }
+    }
+
+    error = dmu_tx_assign(tx, zfsvfs->z_assign);
+    if (error) {
+        mutex_exit(&zp->z_acl_lock);
+        mutex_exit(&zp->z_lock);
+
+        if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+            dmu_tx_wait(tx);
+            dmu_tx_abort(tx);
+            goto top;
+        }
+        dmu_tx_abort(tx);
+        zfs_acl_free(aclp);
+        return (error);
+    }
+
+    error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
+    ASSERT(error == 0);
+
+    zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+    if (fuidp)
+        zfs_fuid_info_free(fuidp);
+    zfs_acl_free(aclp);
+    dmu_tx_commit(tx);
+done:
+    mutex_exit(&zp->z_acl_lock);
+    mutex_exit(&zp->z_lock);
+
+    return (error);
+}
+
+/*
+ * working_mode returns the permissions that were not granted
+ */
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+    boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+    zfs_acl_t *aclp;
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+    int error;
+    uid_t uid = crgetuid(cr);
+    uint64_t who;
+    uint16_t type, iflags;
+    uint16_t entry_type;
+    uint32_t access_mask;
+    uint32_t deny_mask = 0;
+    zfs_ace_hdr_t *acep = NULL;
+    boolean_t checkit;
+    uid_t fowner;
+    uid_t gowner;
+
+    /*
+     * Short circuit empty requests
+     */
+    if (v4_mode == 0)
+        return (0);
+
+    *check_privs = B_TRUE;
+
+    if (zfsvfs->z_assign >= TXG_INITIAL) {    /* ZIL replay */
+        *working_mode = 0;
+        return (0);
+    }
+
+    *working_mode = v4_mode;
+
+    if ((v4_mode & WRITE_MASK) &&
+        (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+        (!IS_DEVVP(ZTOV(zp)))) {
+        *check_privs = B_FALSE;
+        return (EROFS);
+    }
+
+    /*
+     * Only check for READONLY on non-directories.
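+     *
+     * To illustrate (a hedged reading of the test below, not
+     * normative): a regular file carrying ZFS_READONLY or
+     * ZFS_IMMUTABLE rejects data writes with EPERM, while a
+     * directory rejects them only when ZFS_IMMUTABLE is set, so a
+     * directory flagged READONLY is not rejected here.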
+     */
+    if ((v4_mode & WRITE_MASK_DATA) &&
+        (((ZTOV(zp)->v_type != VDIR) &&
+        (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+        (ZTOV(zp)->v_type == VDIR &&
+        (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
+        *check_privs = B_FALSE;
+        return (EPERM);
+    }
+
+    if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+        (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+        *check_privs = B_FALSE;
+        return (EPERM);
+    }
+
+    if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+        (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
+        *check_privs = B_FALSE;
+        return (EACCES);
+    }
+
+    /*
+     * The caller requested that the ACL check be skipped. This
+     * would only happen if the caller checked VOP_ACCESS() with a
+     * 32 bit ACE mask and already had the appropriate permissions.
+     */
+    if (skipaclchk) {
+        *working_mode = 0;
+        return (0);
+    }
+
+    zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+    mutex_enter(&zp->z_acl_lock);
+
+    error = zfs_acl_node_read(zp, &aclp, B_FALSE);
+    if (error != 0) {
+        mutex_exit(&zp->z_acl_lock);
+        return (error);
+    }
+
+    while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+        &iflags, &type)) {
+
+        if (iflags & ACE_INHERIT_ONLY_ACE)
+            continue;
+
+        entry_type = (iflags & ACE_TYPE_FLAGS);
+
+        checkit = B_FALSE;
+
+        switch (entry_type) {
+        case ACE_OWNER:
+            if (uid == fowner)
+                checkit = B_TRUE;
+            break;
+        case OWNING_GROUP:
+            who = gowner;
+            /*FALLTHROUGH*/
+        case ACE_IDENTIFIER_GROUP:
+            checkit = zfs_groupmember(zfsvfs, who, cr);
+            break;
+        case ACE_EVERYONE:
+            checkit = B_TRUE;
+            break;
+
+        /* USER Entry */
+        default:
+            if (entry_type == 0) {
+                uid_t newid;
+
+                newid = zfs_fuid_map_id(zfsvfs, who, cr,
+                    ZFS_ACE_USER);
+                if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
+                    uid == newid)
+                    checkit = B_TRUE;
+                break;
+            } else {
+                zfs_acl_free(aclp);
+                mutex_exit(&zp->z_acl_lock);
+                return (EIO);
+            }
+        }
+
+        if (checkit) {
+            uint32_t mask_matched = (access_mask & *working_mode);
+
+            if (mask_matched) {
+                if (type == DENY)
+                    deny_mask |= mask_matched;
+
+                *working_mode &= ~mask_matched;
+            }
+        }
+
+        /* Are we done? */
+        if (*working_mode == 0)
+            break;
+    }
+
+    mutex_exit(&zp->z_acl_lock);
+    zfs_acl_free(aclp);
+
+    /* Put the found 'denies' back on the working mode */
+    *working_mode |= deny_mask;
+
+    if (*working_mode)
+        return (EACCES);
+
+    return (0);
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+    cred_t *cr)
+{
+    if (*working_mode != ACE_WRITE_DATA)
+        return (EACCES);
+
+    return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+        check_privs, B_FALSE, cr));
+}
+
+/*
+ * Determine whether access should be granted/denied, invoking least
+ * priv subsystem when a deny is determined.
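+ *
+ * Typical usage (an illustrative sketch, mirroring the
+ * zfs_zaccess_rwx() wrapper further down): a caller holding
+ * traditional Unix mode bits converts them first, e.g.
+ *
+ *    error = zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), 0,
+ *        B_FALSE, cr);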
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+    uint32_t working_mode;
+    int error;
+    int is_attr;
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+    boolean_t check_privs;
+    znode_t *xzp;
+    znode_t *check_zp = zp;
+
+    is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
+        (ZTOV(zp)->v_type == VDIR));
+
+    /*
+     * If attribute then validate against base file
+     */
+    if (is_attr) {
+        if ((error = zfs_zget(zp->z_zfsvfs,
+            zp->z_phys->zp_parent, &xzp)) != 0) {
+            return (error);
+        }
+
+        check_zp = xzp;
+
+        /*
+         * fixup mode to map to xattr perms
+         */
+
+        if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+            mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+            mode |= ACE_WRITE_NAMED_ATTRS;
+        }
+
+        if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+            mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+            mode |= ACE_READ_NAMED_ATTRS;
+        }
+    }
+
+    if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+        &check_privs, skipaclchk, cr)) == 0) {
+        if (is_attr)
+            VN_RELE(ZTOV(xzp));
+        return (0);
+    }
+
+    if (error && !check_privs) {
+        if (is_attr)
+            VN_RELE(ZTOV(xzp));
+        return (error);
+    }
+
+    if (error && (flags & V_APPEND)) {
+        error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+    }
+
+    if (error && check_privs) {
+        uid_t owner;
+        mode_t checkmode = 0;
+
+        owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr,
+            ZFS_OWNER);
+
+        /*
+         * First check for implicit owner permission on
+         * read_acl/read_attributes
+         */
+
+        error = 0;
+        ASSERT(working_mode != 0);
+
+        if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+            owner == crgetuid(cr)))
+            working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+        if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+            ACE_READ_ACL|ACE_READ_ATTRIBUTES))
+            checkmode |= VREAD;
+        if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+            ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES))
+            checkmode |= VWRITE;
+        if (working_mode & ACE_EXECUTE)
+            checkmode |= VEXEC;
+
+        if (checkmode)
+            error = secpolicy_vnode_access(cr, ZTOV(check_zp),
+                owner, checkmode);
+
+        if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+            error = secpolicy_vnode_create_gid(cr);
+        if (error == 0 && (working_mode & ACE_WRITE_ACL))
+            error = secpolicy_vnode_setdac(cr, owner);
+
+        if (error == 0 && (working_mode &
+            (ACE_DELETE|ACE_DELETE_CHILD)))
+            error = secpolicy_vnode_remove(cr);
+
+        if (error == 0 && (working_mode & ACE_SYNCHRONIZE))
+            error = secpolicy_vnode_owner(cr, owner);
+
+        if (error == 0) {
+            /*
+             * See if any bits other than those already checked
+             * for are still present.
+             * If so then return EACCES
+             */
+            if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+                error = EACCES;
+            }
+        }
+    }
+
+    if (is_attr)
+        VN_RELE(ZTOV(xzp));
+
+    return (error);
+}
+
+/*
+ * Translate traditional unix VREAD/VWRITE/VEXEC mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+    return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+    int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+    return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+    mode_t missing_perms, cred_t *cr)
+{
+    int error;
+    uid_t downer;
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+    downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER);
+
+    error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms);
+
+    if (error == 0)
+        error = zfs_sticky_remove_access(dzp, zp, cr);
+
+    return (error);
+}
+
+/*
+ * Determine whether access should be granted/denied, without
+ * consulting least priv subsystem.
+ *
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ *  -------------------------------------------------------
+ *  |   Parent Dir  |      Target Object Permissions      |
+ *  |  permissions  |                                     |
+ *  -------------------------------------------------------
+ *  |               | ACL Allows | ACL Denies| Delete     |
+ *  |               |  Delete    |  Delete   | unspecified|
+ *  -------------------------------------------------------
+ *  |  ACL Allows   | Permit     | Permit    | Permit     |
+ *  |  DELETE_CHILD |            |           |            |
+ *  -------------------------------------------------------
+ *  |  ACL Denies   | Permit     | Deny      | Deny       |
+ *  |  DELETE_CHILD |            |           |            |
+ *  -------------------------------------------------------
+ *  |  ACL specifies|            |           |            |
+ *  |  only allow   | Permit     | Permit    | Permit     |
+ *  |  write and    |            |           |            |
+ *  |  execute      |            |           |            |
+ *  -------------------------------------------------------
+ *  |  ACL denies   |            |           |            |
+ *  |  write and    | Permit     | Deny      | Deny       |
+ *  |  execute      |            |           |            |
+ *  -------------------------------------------------------
+ *     ^
+ *     |
+ *     No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+    uint32_t dzp_working_mode = 0;
+    uint32_t zp_working_mode = 0;
+    int dzp_error, zp_error;
+    mode_t missing_perms;
+    boolean_t dzpcheck_privs = B_TRUE;
+    boolean_t zpcheck_privs = B_TRUE;
+
+    /*
+     * We want specific DELETE permissions to
+     * take precedence over WRITE/EXECUTE. We don't
+     * want an ACL such as this to mess us up.
+     * user:joe:write_data:deny,user:joe:delete:allow
+     *
+     * However, deny permissions may ultimately be overridden
+     * by secpolicy_vnode_access().
+     *
+     * We will ask for all of the necessary permissions and then
+     * look at the working modes from the directory and target object
+     * to determine what was found.
+     */
+
+    if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+        return (EPERM);
+
+    /*
+     * If the directory permissions allow the delete, we are done.
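+     *
+     * E.g. (illustrative): when zfs_zaccess_common() below reports
+     * that ACE_DELETE_CHILD, ACE_EXECUTE, and ACE_WRITE_DATA were
+     * all granted on the parent, the delete is permitted without
+     * ever consulting the target object's own ACL.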
+     */
+    if ((dzp_error = zfs_zaccess_common(dzp,
+        ACE_DELETE_CHILD|ACE_EXECUTE|ACE_WRITE_DATA,
+        &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+        return (0);
+
+    /*
+     * If target object has delete permission then we are done
+     */
+    if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+        &zpcheck_privs, B_FALSE, cr)) == 0)
+        return (0);
+
+    if (!dzpcheck_privs)
+        return (dzp_error);
+    else if (!zpcheck_privs)
+        return (zp_error);
+
+    /*
+     * First check the first row.
+     * We only need to see if parent Allows delete_child
+     */
+    if ((dzp_working_mode & ACE_DELETE_CHILD) == 0)
+        return (0);
+
+    /*
+     * Second row
+     * We already have the necessary information in
+     * zp_working_mode, zp_error and dzp_error.
+     */
+
+    if ((zp_working_mode & ACE_DELETE) == 0)
+        return (0);
+
+    /*
+     * Determine the needed permissions based on the directory's
+     * working mode
+     */
+
+    missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0;
+    missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0;
+
+    if (dzp_error == EACCES)
+        return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
+
+    /*
+     * Third Row
+     * only need to see if we have write/execute on directory.
+     */
+
+    if (missing_perms == 0)
+        return (zfs_sticky_remove_access(dzp, zp, cr));
+
+    /*
+     * Fourth Row
+     */
+
+    if (missing_perms && ((zp_working_mode & ACE_DELETE) == 0))
+        return (zfs_sticky_remove_access(dzp, zp, cr));
+
+    return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+    int add_perm;
+    int error;
+
+    if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+        return (EACCES);
+
+    add_perm = (ZTOV(szp)->v_type == VDIR) ?
+        ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+    /*
+     * Rename permissions are a combination of delete permission +
+     * add file/subdir permission.
+     */
+
+    /*
+     * First make sure we do the delete portion.
+     *
+     * If that succeeds then check for add_file/add_subdir permissions
+     */
+
+    if (error = zfs_zaccess_delete(sdzp, szp, cr))
+        return (error);
+
+    /*
+     * If we have a tzp, see if we can delete it.
+     */
+    if (tzp) {
+        if (error = zfs_zaccess_delete(tdzp, tzp, cr))
+            return (error);
+    }
+
+    /*
+     * Now check for add permissions
+     */
+    error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+    return (error);
+}
diff --git a/zfs/lib/libdmu-ctl/zfs_ctldir.c b/zfs/lib/libdmu-ctl/zfs_ctldir.c
new file mode 100644
index 0000000000..45de481c9f
--- /dev/null
+++ b/zfs/lib/libdmu-ctl/zfs_ctldir.c
@@ -0,0 +1,1147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zfs_ctldir.c 1.20 08/04/27 SMI"
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ *     ctldir ------> snapshotdir -------> snapshot
+ *                                             |
+ *                                             |
+ *                                             V
+ *                                         mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted-on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef struct zfsctl_node {
+    gfs_dir_t zc_gfs_private;
+    uint64_t zc_id;
+    timestruc_t zc_cmtime;    /* ctime and mtime, always the same */
+} zfsctl_node_t;
+
+typedef struct zfsctl_snapdir {
+    zfsctl_node_t sd_node;
+    kmutex_t sd_lock;
+    avl_tree_t sd_snaps;
+} zfsctl_snapdir_t;
+
+typedef struct {
+    char *se_name;
+    vnode_t *se_root;
+    avl_node_t se_node;
+} zfs_snapentry_t;
+
+static int
+snapentry_compare(const void *a, const void *b)
+{
+    const zfs_snapentry_t *sa = a;
+    const zfs_snapentry_t *sb = b;
+    int ret = strcmp(sa->se_name, sb->se_name);
+
+    if (ret < 0)
+        return (-1);
+    else if (ret > 0)
+        return (1);
+    else
+        return (0);
+}
+
+vnodeops_t *zfsctl_ops_root;
+vnodeops_t *zfsctl_ops_snapdir;
+vnodeops_t *zfsctl_ops_snapshot;
+
+static const fs_operation_def_t zfsctl_tops_root[];
+static const fs_operation_def_t zfsctl_tops_snapdir[];
+static const fs_operation_def_t zfsctl_tops_snapshot[];
+
+static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
+static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
+
+static gfs_opsvec_t zfsctl_opsvec[] = {
+    { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
+    { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
+    { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
+    { NULL }
+};
+
+/*
+ * Root directory elements. We have only a single static entry, 'snapshot'.
+ */
+static gfs_dirent_t zfsctl_root_entries[] = {
+    { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+    { NULL }
+};
+
+/* include . and .. in the calculation */
+#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
+    sizeof (gfs_dirent_t)) + 1)
+
+
+/*
+ * Initialize the various GFS pieces we'll need to create and manipulate .zfs
+ * directories. This is called from the ZFS init routine, and initializes the
+ * vnode ops vectors that we'll be using.
+ */
+void
+zfsctl_init(void)
+{
+    VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+}
+
+void
+zfsctl_fini(void)
+{
+    /*
+     * Remove zfsctl vnode ops
+     */
+    if (zfsctl_ops_root)
+        vn_freevnodeops(zfsctl_ops_root);
+    if (zfsctl_ops_snapdir)
+        vn_freevnodeops(zfsctl_ops_snapdir);
+    if (zfsctl_ops_snapshot)
+        vn_freevnodeops(zfsctl_ops_snapshot);
+
+    zfsctl_ops_root = NULL;
+    zfsctl_ops_snapdir = NULL;
+    zfsctl_ops_snapshot = NULL;
+}
+
+/*
+ * Return the inode number associated with the 'snapshot' directory.
+ */
+/* ARGSUSED */
+static ino64_t
+zfsctl_root_inode_cb(vnode_t *vp, int index)
+{
+    ASSERT(index == 0);
+    return (ZFSCTL_INO_SNAPDIR);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the vfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+    vnode_t *vp, *rvp;
+    zfsctl_node_t *zcp;
+
+    ASSERT(zfsvfs->z_ctldir == NULL);
+
+    vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
+        zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
+        zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
+    zcp = vp->v_data;
+    zcp->zc_id = ZFSCTL_INO_ROOT;
+
+    VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
+    ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
+    VN_RELE(rvp);
+
+    /*
+     * We're only faking the fact that we have a root of a filesystem for
+     * the sake of the GFS interfaces. Undo the flag manipulation it did
+     * for us.
+     */
+    vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
+
+    zfsvfs->z_ctldir = vp;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
+ * There might still be more references if we were force unmounted, but only
+ * new zfs_inactive() calls can occur and they don't reference .zfs.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+    VN_RELE(zfsvfs->z_ctldir);
+    zfsvfs->z_ctldir = NULL;
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+vnode_t *
+zfsctl_root(znode_t *zp)
+{
+    ASSERT(zfs_has_ctldir(zp));
+    VN_HOLD(zp->z_zfsvfs->z_ctldir);
+    return (zp->z_zfsvfs->z_ctldir);
+}
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
+{
+    if (flags & FWRITE)
+        return (EACCES);
+
+    return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
+    cred_t *cr, caller_context_t *ct)
+{
+    return (0);
+}
+
+/*
+ * Common access routine. Disallow writes.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+    if (mode & VWRITE)
+        return (EACCES);
+
+    return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information.
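+ *
+ * Net effect (an illustrative sketch, not normative): every .zfs
+ * object reports itself as a read/execute-only directory owned by
+ * root, so "ls -ld /tank/.zfs" (hypothetical pool path) would be
+ * expected to show something like "dr-xr-xr-x ... root root".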
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+    zfsctl_node_t *zcp = vp->v_data;
+    timestruc_t now;
+
+    vap->va_uid = 0;
+    vap->va_gid = 0;
+    vap->va_rdev = 0;
+    /*
+     * We are a purely virtual object, so we have no
+     * blocksize or allocated blocks.
+     */
+    vap->va_blksize = 0;
+    vap->va_nblocks = 0;
+    vap->va_seq = 0;
+    vap->va_fsid = vp->v_vfsp->vfs_dev;
+    vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+        S_IROTH | S_IXOTH;
+    vap->va_type = VDIR;
+    /*
+     * We live in the now (for atime).
+     */
+    gethrestime(&now);
+    vap->va_atime = now;
+    vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+}
+
+/*ARGSUSED*/
+static int
+zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+    zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+    zfsctl_node_t *zcp = vp->v_data;
+    uint64_t object = zcp->zc_id;
+    zfid_short_t *zfid;
+    int i;
+
+    ZFS_ENTER(zfsvfs);
+
+    if (fidp->fid_len < SHORT_FID_LEN) {
+        fidp->fid_len = SHORT_FID_LEN;
+        ZFS_EXIT(zfsvfs);
+        return (ENOSPC);
+    }
+
+    zfid = (zfid_short_t *)fidp;
+
+    zfid->zf_len = SHORT_FID_LEN;
+
+    for (i = 0; i < sizeof (zfid->zf_object); i++)
+        zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+    /* .zfs znodes always have a generation number of 0 */
+    for (i = 0; i < sizeof (zfid->zf_gen); i++)
+        zfid->zf_gen[i] = 0;
+
+    ZFS_EXIT(zfsvfs);
+    return (0);
+}
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ *     ENTRY                   ZFSCTL_INODE
+ *     .zfs                    1
+ *     .zfs/snapshot           2
+ *     .zfs/snapshot/<snap>    objectid(snap)
+ */
+
+#define ZFSCTL_INO_SNAP(id) (id)
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+    zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+    ZFS_ENTER(zfsvfs);
+    vap->va_nodeid = ZFSCTL_INO_ROOT;
+    vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+
+    zfsctl_common_getattr(vp, vap);
+    ZFS_EXIT(zfsvfs);
+
+    return (0);
+}
+
+/*
+ * Special case the handling of "..".
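+ *
+ * That is, looking up ".." from '.zfs' must land on the root of the
+ * head filesystem itself (via VFS_ROOT() below) rather than a GFS
+ * parent. A sketch of the observable behavior, with a hypothetical
+ * pool path: "cd /tank/.zfs; cd .." puts you back in /tank.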
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
+{
+    zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+    int err;
+
+    /*
+     * No extended attributes allowed under .zfs
+     */
+    if (flags & LOOKUP_XATTR)
+        return (EINVAL);
+
+    ZFS_ENTER(zfsvfs);
+
+    if (strcmp(nm, "..") == 0) {
+        err = VFS_ROOT(dvp->v_vfsp, vpp);
+    } else {
+        err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
+            cr, ct, direntflags, realpnp);
+    }
+
+    ZFS_EXIT(zfsvfs);
+
+    return (err);
+}
+
+static const fs_operation_def_t zfsctl_tops_root[] = {
+    { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
+    { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
+    { VOPNAME_IOCTL, { .error = fs_inval } },
+    { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } },
+    { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
+    { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
+    { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } },
+    { VOPNAME_SEEK, { .vop_seek = fs_seek } },
+    { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
+    { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
+    { NULL }
+};
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+    objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+    dmu_objset_name(os, zname);
+    if (strlen(zname) + 1 + strlen(name) >= len)
+        return (ENAMETOOLONG);
+    (void) strcat(zname, "@");
+    (void) strcat(zname, name);
+    return (0);
+}
+
+static int
+zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
+{
+    vnode_t *svp = sep->se_root;
+    int error;
+
+    ASSERT(vn_ismntpt(svp));
+
+    /* this will be dropped by dounmount() */
+    if ((error = vn_vfswlock(svp)) != 0)
+        return (error);
+
+    VN_HOLD(svp);
+    error = dounmount(vn_mountedvfs(svp), fflags, cr);
+    if (error) {
+        VN_RELE(svp);
+        return (error);
+    }
+    VFS_RELE(svp->v_vfsp);
+    /*
+     * We can't use VN_RELE(), as that will try to invoke
+     * zfsctl_snapdir_inactive(), which would cause us to destroy
+     * the sd_lock mutex held by our caller.
+     */
+    ASSERT(svp->v_count == 1);
+    gfs_vop_inactive(svp, cr, NULL);
+
+    kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+    kmem_free(sep, sizeof (zfs_snapentry_t));
+
+    return (0);
+}
+
+static void
+zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+{
+    avl_index_t where;
+    vfs_t *vfsp;
+    refstr_t *pathref;
+    char newpath[MAXNAMELEN];
+    char *tail;
+
+    ASSERT(MUTEX_HELD(&sdp->sd_lock));
+    ASSERT(sep != NULL);
+
+    vfsp = vn_mountedvfs(sep->se_root);
+    ASSERT(vfsp != NULL);
+
+    vfs_lock_wait(vfsp);
+
+    /*
+     * Change the name in the AVL tree.
+     */
+    avl_remove(&sdp->sd_snaps, sep);
+    kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+    sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+    (void) strcpy(sep->se_name, nm);
+    VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
+    avl_insert(&sdp->sd_snaps, sep, where);
+
+    /*
+     * Change the current mountpoint info:
+     *   - update the tail of the mntpoint path
+     *   - update the tail of the resource path
+     */
+    pathref = vfs_getmntpoint(vfsp);
+    (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
+    VERIFY((tail = strrchr(newpath, '/')) != NULL);
+    *(tail+1) = '\0';
+    ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
+    (void) strcat(newpath, nm);
+    refstr_rele(pathref);
+    vfs_setmntpoint(vfsp, newpath);
+
+    pathref = vfs_getresource(vfsp);
+    (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
+    VERIFY((tail = strrchr(newpath, '@')) != NULL);
+    *(tail+1) = '\0';
+    ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
+    (void) strcat(newpath, nm);
+    refstr_rele(pathref);
+    vfs_setresource(vfsp, newpath);
+
+    vfs_unlock(vfsp);
+}
+
+/*ARGSUSED*/
+static int
+zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
+    cred_t *cr, caller_context_t *ct, int flags)
+{
+    zfsctl_snapdir_t *sdp = sdvp->v_data;
+    zfs_snapentry_t search, *sep;
+    zfsvfs_t *zfsvfs;
+    avl_index_t where;
+    char from[MAXNAMELEN], to[MAXNAMELEN];
+    char real[MAXNAMELEN];
+    int err;
+
+    zfsvfs = sdvp->v_vfsp->vfs_data;
+    ZFS_ENTER(zfsvfs);
+
+    if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+        err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+            MAXNAMELEN, NULL);
+        if (err == 0) {
+            snm = real;
+        } else if (err != ENOTSUP) {
+            ZFS_EXIT(zfsvfs);
+            return (err);
+        }
+    }
+
+    ZFS_EXIT(zfsvfs);
+
+    err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
+    if (!err)
+        err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
+    if (!err)
+        err = zfs_secpolicy_rename_perms(from, to, cr);
+    if (err)
+        return (err);
+
+    /*
+     * Cannot move snapshots out of the snapdir.
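+     *
+     * E.g. (illustrative, hypothetical paths): renaming within the
+     * directory, "mv /tank/.zfs/snapshot/a /tank/.zfs/snapshot/b",
+     * is carried out via dmu_objset_rename() below, whereas
+     * "mv /tank/.zfs/snapshot/a /tank/a" fails with EINVAL.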
+     */
+    if (sdvp != tdvp)
+        return (EINVAL);
+
+    if (strcmp(snm, tnm) == 0)
+        return (0);
+
+    mutex_enter(&sdp->sd_lock);
+
+    search.se_name = (char *)snm;
+    if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
+        mutex_exit(&sdp->sd_lock);
+        return (ENOENT);
+    }
+
+    err = dmu_objset_rename(from, to, B_FALSE);
+    if (err == 0)
+        zfsctl_rename_snap(sdp, sep, tnm);
+
+    mutex_exit(&sdp->sd_lock);
+
+    return (err);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
+    caller_context_t *ct, int flags)
+{
+    zfsctl_snapdir_t *sdp = dvp->v_data;
+    zfs_snapentry_t *sep;
+    zfs_snapentry_t search;
+    zfsvfs_t *zfsvfs;
+    char snapname[MAXNAMELEN];
+    char real[MAXNAMELEN];
+    int err;
+
+    zfsvfs = dvp->v_vfsp->vfs_data;
+    ZFS_ENTER(zfsvfs);
+
+    if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+
+        err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+            MAXNAMELEN, NULL);
+        if (err == 0) {
+            name = real;
+        } else if (err != ENOTSUP) {
+            ZFS_EXIT(zfsvfs);
+            return (err);
+        }
+    }
+
+    ZFS_EXIT(zfsvfs);
+
+    err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
+    if (!err)
+        err = zfs_secpolicy_destroy_perms(snapname, cr);
+    if (err)
+        return (err);
+
+    mutex_enter(&sdp->sd_lock);
+
+    search.se_name = name;
+    sep = avl_find(&sdp->sd_snaps, &search, NULL);
+    if (sep) {
+        avl_remove(&sdp->sd_snaps, sep);
+        err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
+        if (err)
+            avl_add(&sdp->sd_snaps, sep);
+        else
+            err = dmu_objset_destroy(snapname);
+    } else {
+        err = ENOENT;
+    }
+
+    mutex_exit(&sdp->sd_lock);
+
+    return (err);
+}
+
+/*
+ * This creates a snapshot under '.zfs/snapshot'.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
+    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
+{
+    zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+    char name[MAXNAMELEN];
+    int err;
+    static enum symfollow follow = NO_FOLLOW;
+    static enum uio_seg seg = UIO_SYSSPACE;
+
+    dmu_objset_name(zfsvfs->z_os, name);
+
+    *vpp = NULL;
+
+    err = zfs_secpolicy_snapshot_perms(name, cr);
+    if (err)
+        return (err);
+
+    if (err == 0) {
+        err = dmu_objset_snapshot(name, dirname, B_FALSE);
+        if (err)
+            return (err);
+        err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
+    }
+
+    return (err);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
+{
+    zfsctl_snapdir_t *sdp = dvp->v_data;
+    objset_t *snap;
+    char snapname[MAXNAMELEN];
+    char real[MAXNAMELEN];
+    char *mountpoint;
+    zfs_snapentry_t *sep, search;
+    struct mounta margs;
+    vfs_t *vfsp;
+    size_t mountpoint_len;
+    avl_index_t where;
+    zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+    int err;
+
+    /*
+     * No extended attributes allowed under .zfs
+     */
+    if (flags & LOOKUP_XATTR)
+        return (EINVAL);
+
+    ASSERT(dvp->v_type == VDIR);
+
+    if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
+        return (0);
+
+    /*
+     * If we get a recursive call, that means we got called
+     * from the domount() code while it was trying to look up the
+     * spec (which looks like a local path for zfs). We need to
+     * add some flag to domount() to tell it not to do this lookup.
+     */
+    if (MUTEX_HELD(&sdp->sd_lock))
+        return (ENOENT);
+
+    ZFS_ENTER(zfsvfs);
+
+    if (flags & FIGNORECASE) {
+        boolean_t conflict = B_FALSE;
+
+        err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
+            MAXNAMELEN, &conflict);
+        if (err == 0) {
+            nm = real;
+        } else if (err != ENOTSUP) {
+            ZFS_EXIT(zfsvfs);
+            return (err);
+        }
+        if (realpnp)
+            (void) strlcpy(realpnp->pn_buf, nm,
+                realpnp->pn_bufsize);
+        if (conflict && direntflags)
+            *direntflags = ED_CASE_CONFLICT;
+    }
+
+    mutex_enter(&sdp->sd_lock);
+    search.se_name = (char *)nm;
+    if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+        *vpp = sep->se_root;
+        VN_HOLD(*vpp);
+        err = traverse(vpp);
+        if (err) {
+            VN_RELE(*vpp);
+            *vpp = NULL;
+        } else if (*vpp == sep->se_root) {
+            /*
+             * The snapshot was unmounted behind our backs,
+             * try to remount it.
+             */
+            goto domount;
+        } else {
+            /*
+             * VROOT was set during the traverse call. We need
+             * to clear it since we're pretending to be part
+             * of our parent's vfs.
+             */
+            (*vpp)->v_flag &= ~VROOT;
+        }
+        mutex_exit(&sdp->sd_lock);
+        ZFS_EXIT(zfsvfs);
+        return (err);
+    }
+
+    /*
+     * The requested snapshot is not currently mounted, look it up.
+     */
+    err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
+    if (err) {
+        mutex_exit(&sdp->sd_lock);
+        ZFS_EXIT(zfsvfs);
+        return (err);
+    }
+    if (dmu_objset_open(snapname, DMU_OST_ZFS,
+        DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
+        mutex_exit(&sdp->sd_lock);
+        ZFS_EXIT(zfsvfs);
+        return (ENOENT);
+    }
+
+    sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+    sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+    (void) strcpy(sep->se_name, nm);
+    *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
+    avl_insert(&sdp->sd_snaps, sep, where);
+
+    dmu_objset_close(snap);
+domount:
+    mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
+        strlen("/.zfs/snapshot/") + strlen(nm) + 1;
+    mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+    (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
+        refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
+
+    margs.spec = snapname;
+    margs.dir = mountpoint;
+    margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
+    margs.fstype = "zfs";
+    margs.dataptr = NULL;
+    margs.datalen = 0;
+    margs.optptr = NULL;
+    margs.optlen = 0;
+
+    err = domount("zfs", &margs, *vpp, kcred, &vfsp);
+    kmem_free(mountpoint, mountpoint_len);
+
+    if (err == 0) {
+        /*
+         * Return the mounted root rather than the covered mount point.
+         * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
+         * the ZFS vnode mounted on top of the GFS node. This ZFS
+         * vnode is the root of the newly created vfsp.
+         */
+        VFS_RELE(vfsp);
+        err = traverse(vpp);
+    }
+
+    if (err == 0) {
+        /*
+         * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
+         *
+         * This is where we lie about our v_vfsp in order to
+         * make .zfs/snapshot/<snapname> accessible over NFS
+         * without requiring manual mounts of <snapname>.
+         */
+        ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+        VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+        (*vpp)->v_vfsp = zfsvfs->z_vfs;
+        (*vpp)->v_flag &= ~VROOT;
+    }
+    mutex_exit(&sdp->sd_lock);
+    ZFS_EXIT(zfsvfs);
+
+    /*
+     * If we had an error, drop our hold on the vnode and
+     * zfsctl_snapshot_inactive() will clean up.
+     */
+    if (err) {
+        VN_RELE(*vpp);
+        *vpp = NULL;
+    }
+    return (err);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
+    offset_t *offp, offset_t *nextp, void *data, int flags)
+{
+    zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+    char snapname[MAXNAMELEN];
+    uint64_t id, cookie;
+    boolean_t case_conflict;
+    int error;
+
+    ZFS_ENTER(zfsvfs);
+
+    cookie = *offp;
+    error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
+        &cookie, &case_conflict);
+    if (error) {
+        ZFS_EXIT(zfsvfs);
+        if (error == ENOENT) {
+            *eofp = 1;
+            return (0);
+        }
+        return (error);
+    }
+
+    if (flags & V_RDDIR_ENTFLAGS) {
+        edirent_t *eodp = dp;
+
+        (void) strcpy(eodp->ed_name, snapname);
+        eodp->ed_ino = ZFSCTL_INO_SNAP(id);
+        eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
+    } else {
+        struct dirent64 *odp = dp;
+
+        (void) strcpy(odp->d_name, snapname);
+        odp->d_ino = ZFSCTL_INO_SNAP(id);
+    }
+    *nextp = cookie;
+
+    ZFS_EXIT(zfsvfs);
+
+    return (0);
+}
+
+/*
+ * pvp is the '.zfs' directory (zfsctl_node_t).
+ * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
+ *
+ * This function is the callback to create a GFS vnode for '.zfs/snapshot'
+ * when a lookup is performed on .zfs for "snapshot".
+ */
+vnode_t *
+zfsctl_mknode_snapdir(vnode_t *pvp)
+{
+    vnode_t *vp;
+    zfsctl_snapdir_t *sdp;
+
+    vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
+        zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
+        zfsctl_snapdir_readdir_cb, NULL);
+    sdp = vp->v_data;
+    sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
+    sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
+    mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
+    avl_create(&sdp->sd_snaps, snapentry_compare,
+        sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
+    return (vp);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+    zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+    zfsctl_snapdir_t *sdp = vp->v_data;
+
+    ZFS_ENTER(zfsvfs);
+    zfsctl_common_getattr(vp, vap);
+    vap->va_nodeid = gfs_file_inode(vp);
+    vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+    ZFS_EXIT(zfsvfs);
+
+    return (0);
+}
+
+/* ARGSUSED */
+static void
+zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+    zfsctl_snapdir_t *sdp = vp->v_data;
+    void *private;
+
+    private = gfs_dir_inactive(vp);
+    if (private != NULL) {
+        ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
+        mutex_destroy(&sdp->sd_lock);
+        avl_destroy(&sdp->sd_snaps);
+        kmem_free(private, sizeof (zfsctl_snapdir_t));
+    }
+}
+
+static const fs_operation_def_t zfsctl_tops_snapdir[] = {
+    { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
+    { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
+    { VOPNAME_IOCTL, { .error = fs_inval } },
+    { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } },
+    { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
+    { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } },
+    { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } },
+    { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } },
+    { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
+    { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } },
+    { VOPNAME_SEEK, { .vop_seek = fs_seek } },
+    { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } },
+    { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
+    { NULL }
+};
+
+/*
+ * pvp is the GFS vnode '.zfs/snapshot'.
+ *
+ * This creates a GFS node under '.zfs/snapshot' representing each
+ * snapshot. This newly created GFS node is what we mount snapshot
+ * vfs_t's on top of.
+ */
+static vnode_t *
+zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
+{
+    vnode_t *vp;
+    zfsctl_node_t *zcp;
+
+    vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
+        zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
+    zcp = vp->v_data;
+    zcp->zc_id = objset;
+    VFS_HOLD(vp->v_vfsp);
+
+    return (vp);
+}
+
+static void
+zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+    zfsctl_snapdir_t *sdp;
+    zfs_snapentry_t *sep, *next;
+    vnode_t *dvp;
+
+    VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
+    sdp = dvp->v_data;
+
+    mutex_enter(&sdp->sd_lock);
+
+    if (vp->v_count > 1) {
+        mutex_exit(&sdp->sd_lock);
+        return;
+    }
+    ASSERT(!vn_ismntpt(vp));
+
+    sep = avl_first(&sdp->sd_snaps);
+    while (sep != NULL) {
+        next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+        if (sep->se_root == vp) {
+            avl_remove(&sdp->sd_snaps, sep);
+            kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+            kmem_free(sep, sizeof (zfs_snapentry_t));
+            break;
+        }
+        sep = next;
+    }
+    ASSERT(sep != NULL);
+
+    mutex_exit(&sdp->sd_lock);
+    VN_RELE(dvp);
+    VFS_RELE(vp->v_vfsp);
+
+    /*
+     * Dispose of the vnode for the snapshot mount point.
+     * This is safe to do because once this entry has been removed
+     * from the AVL tree, it can't be found again, so cannot become
+     * "active". If we lookup the same name again we will end up
+     * creating a new vnode.
+     */
+    gfs_vop_inactive(vp, cr, ct);
+}
+
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.
+ */
+static const fs_operation_def_t zfsctl_tops_snapshot[] = {
+    VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive },
+    NULL, NULL
+};
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+    zfsvfs_t *zfsvfs = vfsp->vfs_data;
+    vnode_t *dvp, *vp;
+    zfsctl_snapdir_t *sdp;
+    zfsctl_node_t *zcp;
+    zfs_snapentry_t *sep;
+    int error;
+
+    ASSERT(zfsvfs->z_ctldir != NULL);
+    error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+        NULL, 0, NULL, kcred, NULL, NULL, NULL);
+    if (error != 0)
+        return (error);
+    sdp = dvp->v_data;
+
+    mutex_enter(&sdp->sd_lock);
+    sep = avl_first(&sdp->sd_snaps);
+    while (sep != NULL) {
+        vp = sep->se_root;
+        zcp = vp->v_data;
+        if (zcp->zc_id == objsetid)
+            break;
+
+        sep = AVL_NEXT(&sdp->sd_snaps, sep);
+    }
+
+    if (sep != NULL) {
+        VN_HOLD(vp);
+        /*
+         * Return the mounted root rather than the covered mount point.
+         * Takes the GFS vnode at .zfs/snapshot/<snapname>
+         * and returns the ZFS vnode mounted on top of the GFS node.
+         * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+         */
+        error = traverse(&vp);
+        if (error == 0) {
+            if (vp == sep->se_root)
+                error = EINVAL;
+            else
+                *zfsvfsp = VTOZ(vp)->z_zfsvfs;
+        }
+        mutex_exit(&sdp->sd_lock);
+        VN_RELE(vp);
+    } else {
+        error = EINVAL;
+        mutex_exit(&sdp->sd_lock);
+    }
+
+    VN_RELE(dvp);
+
+    return (error);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
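+ *
+ * A sketch of the expected call site in zfs_umount() (the surrounding
+ * code and the 'fflag' variable name are hypothetical):
+ *
+ *    if (zfsvfs->z_ctldir != NULL &&
+ *        (error = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+ *            return (error);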
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+    zfsvfs_t *zfsvfs = vfsp->vfs_data;
+    vnode_t *dvp;
+    zfsctl_snapdir_t *sdp;
+    zfs_snapentry_t *sep, *next;
+    int error;
+
+    ASSERT(zfsvfs->z_ctldir != NULL);
+    error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+        NULL, 0, NULL, cr, NULL, NULL, NULL);
+    if (error != 0)
+        return (error);
+    sdp = dvp->v_data;
+
+    mutex_enter(&sdp->sd_lock);
+
+    sep = avl_first(&sdp->sd_snaps);
+    while (sep != NULL) {
+        next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+        /*
+         * If this snapshot is not mounted, then it must
+         * have just been unmounted by somebody else, and
+         * will be cleaned up by zfsctl_snapdir_inactive().
+         */
+        if (vn_ismntpt(sep->se_root)) {
+            avl_remove(&sdp->sd_snaps, sep);
+            error = zfsctl_unmount_snap(sep, fflags, cr);
+            if (error) {
+                avl_add(&sdp->sd_snaps, sep);
+                break;
+            }
+        }
+        sep = next;
+    }
+
+    mutex_exit(&sdp->sd_lock);
+    VN_RELE(dvp);
+
+    return (error);
+}
diff --git a/zfs/lib/libdmu-ctl/zfs_dir.c b/zfs/lib/libdmu-ctl/zfs_dir.c
new file mode 100644
index 0000000000..6f22e2ad1a
--- /dev/null
+++ b/zfs/lib/libdmu-ctl/zfs_dir.c
@@ -0,0 +1,968 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)zfs_dir.c 1.25 08/04/27 SMI"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "fs/fs_subr.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
+    boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+{
+    int error;
+
+    if (zfsvfs->z_norm) {
+        matchtype_t mt = MT_FIRST;
+        boolean_t conflict = B_FALSE;
+        size_t bufsz = 0;
+        char *buf = NULL;
+
+        if (rpnp) {
+            buf = rpnp->pn_buf;
+            bufsz = rpnp->pn_bufsize;
+        }
+        if (exact)
+            mt = MT_EXACT;
+        /*
+         * In the non-mixed case we only expect there would ever
+         * be one match, but we need to use the normalizing lookup.
+         */
+        error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+            zoid, mt, buf, bufsz, &conflict);
+        if (!error && deflags)
+            *deflags = conflict ? ED_CASE_CONFLICT : 0;
+    } else {
+        error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+    }
+    *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+    if (error == ENOENT && update)
+        dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
+
+    return (error);
+}
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ *    dzp    - znode for directory
+ *    name   - name of entry to lock
+ *    flag   - ZNEW: if the entry already exists, fail with EEXIST.
+ *             ZEXISTS: if the entry does not exist, fail with ENOENT.
+ *             ZSHARED: allow concurrent access with other ZSHARED callers.
+ *             ZXATTR: we want dzp's xattr directory
+ *             ZCILOOK: On a mixed sensitivity file system,
+ *                      this lookup should be case-insensitive.
+ *             ZCIEXACT: On a purely case-insensitive file system,
+ *                       this lookup should be case-sensitive.
+ *             ZRENAMING: we are locking for renaming, force narrow locks
+ *
+ * Output arguments:
+ *    zpp    - pointer to the znode for the entry (NULL if there isn't one)
+ *    dlpp   - pointer to the dirlock for this entry (NULL on error)
+ *    direntflags - (case-insensitive lookup only)
+ *             flags if multiple case-sensitive matches exist in directory
+ *    realpnp     - (case-insensitive lookup only)
+ *             actual name matched within the directory
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ * NOTE: For case-insensitive file systems we take wide locks (see below),
+ *       but return znode pointers to a single match.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+    int flag, int *direntflags, pathname_t *realpnp)
+{
+    zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+    zfs_dirlock_t *dl;
+    boolean_t update;
+    boolean_t exact;
+    uint64_t zoid;
+    vnode_t *vp = NULL;
+    int error = 0;
+    int cmpflags;
+
+    *zpp = NULL;
+    *dlpp = NULL;
+
+    /*
+     * Verify that we are not trying to lock '.', '..', or '.zfs'
+     */
+    if (name[0] == '.' &&
+        (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
+        zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
+        return (EEXIST);
+
+    /*
+     * Case sensitivity and normalization preferences are set when
+     * the file system is created. These are stored in the
+     * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+     * affect what vnodes can be cached in the DNLC, how we
+     * perform zap lookups, and the "width" of our dirlocks.
+     *
+     * A normal dirlock locks a single name. Note that with
+     * normalization a name can be composed multiple ways, but
+     * when normalized, these names all compare equal. A wide
+     * dirlock locks multiple names. We need these when the file
+     * system is supporting mixed-mode access. It is sometimes
+     * necessary to lock all case permutations of a file name at
+     * once so that simultaneous case-insensitive/case-sensitive
+     * behaves as rationally as possible.
+     */
+
+    /*
+     * Decide if exact matches should be requested when performing
+     * a zap lookup on file systems supporting case-insensitive
+     * access.
+     */
+    exact =
+        ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
+        ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
+
+    /*
+     * Only look in or update the DNLC if we are looking for the
+     * name on a file system that does not require normalization
+     * or case folding. We can also look there if we happen to be
+     * on a non-normalizing, mixed sensitivity file system IF we
+     * are looking for the exact name.
+     *
+     * Maybe we can add a TO-UPPERed version of name to the dnlc in
+     * the ci-only case for a performance improvement?
+     */
+    update = !zfsvfs->z_norm ||
+        ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+        !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+
+    /*
+     * ZRENAMING indicates we are in a situation where we should
+     * take narrow locks regardless of the file system's
+     * preferences for normalizing and case folding. This will
+     * prevent us deadlocking trying to grab the same wide lock
+     * twice if the two names happen to be case-insensitive
+     * matches.
+     */
+    if (flag & ZRENAMING)
+        cmpflags = 0;
+    else
+        cmpflags = zfsvfs->z_norm;
+
+    /*
+     * Wait until there are no locks on this name.
+     */
+    rw_enter(&dzp->z_name_lock, RW_READER);
+    mutex_enter(&dzp->z_lock);
+    for (;;) {
+        if (dzp->z_unlinked) {
+            mutex_exit(&dzp->z_lock);
+            rw_exit(&dzp->z_name_lock);
+            return (ENOENT);
+        }
+        for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
+            if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
+                U8_UNICODE_LATEST, &error) == 0) || error != 0)
+                break;
+        }
+        if (error != 0) {
+            mutex_exit(&dzp->z_lock);
+            rw_exit(&dzp->z_name_lock);
+            return (ENOENT);
+        }
+        if (dl == NULL) {
+            /*
+             * Allocate a new dirlock and add it to the list.
+             */
+            dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+            cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+            dl->dl_name = name;
+            dl->dl_sharecnt = 0;
+            dl->dl_namesize = 0;
+            dl->dl_dzp = dzp;
+            dl->dl_next = dzp->z_dirlocks;
+            dzp->z_dirlocks = dl;
+            break;
+        }
+        if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+            break;
+        cv_wait(&dl->dl_cv, &dzp->z_lock);
+    }
+
+    if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+        /*
+         * We're the second shared reference to dl. Make a copy of
+         * dl_name in case the first thread goes away before we do.
+         * Note that we initialize the new name before storing its
+         * pointer into dl_name, because the first thread may load
+         * dl->dl_name at any time. He'll either see the old value,
+         * which is his, or the new shared copy; either is OK.
+         */
+        dl->dl_namesize = strlen(dl->dl_name) + 1;
+        name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+        bcopy(dl->dl_name, name, dl->dl_namesize);
+        dl->dl_name = name;
+    }
+
+    mutex_exit(&dzp->z_lock);
+
+    /*
+     * We have a dirlock on the name. (Note that it is the dirlock,
+     * not the dzp's z_lock, that protects the name in the zap object.)
+     * See if there's an object by this name; if so, put a hold on it.
+     */
+    if (flag & ZXATTR) {
+        zoid = dzp->z_phys->zp_xattr;
+        error = (zoid == 0 ? ENOENT : 0);
+    } else {
+        if (update)
+            vp = dnlc_lookup(ZTOV(dzp), name);
+        if (vp == DNLC_NO_VNODE) {
+            VN_RELE(vp);
+            error = ENOENT;
+        } else if (vp) {
+            if (flag & ZNEW) {
+                zfs_dirent_unlock(dl);
+                VN_RELE(vp);
+                return (EEXIST);
+            }
+            *dlpp = dl;
+            *zpp = VTOZ(vp);
+            return (0);
+        } else {
+            error = zfs_match_find(zfsvfs, dzp, name, exact,
+                update, direntflags, realpnp, &zoid);
+        }
+    }
+    if (error) {
+        if (error != ENOENT || (flag & ZEXISTS)) {
+            zfs_dirent_unlock(dl);
+            return (error);
+        }
+    } else {
+        if (flag & ZNEW) {
+            zfs_dirent_unlock(dl);
+            return (EEXIST);
+        }
+        error = zfs_zget(zfsvfs, zoid, zpp);
+        if (error) {
+            zfs_dirent_unlock(dl);
+            return (error);
+        }
+        if (!(flag & ZXATTR) && update)
+            dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+    }
+
+    *dlpp = dl;
+
+    return (0);
+}
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
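+ *
+ * A typical lock/unlock pairing, mirroring the use in zfs_dirlook()
+ * below (illustrative sketch):
+ *
+ *    error = zfs_dirent_lock(&dl, dzp, name, &zp,
+ *        ZEXISTS | ZSHARED, NULL, NULL);
+ *    if (error == 0) {
+ *        *vpp = ZTOV(zp);
+ *        zfs_dirent_unlock(dl);
+ *    }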
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+    znode_t *dzp = dl->dl_dzp;
+    zfs_dirlock_t **prev_dl, *cur_dl;
+
+    mutex_enter(&dzp->z_lock);
+    rw_exit(&dzp->z_name_lock);
+    if (dl->dl_sharecnt > 1) {
+        dl->dl_sharecnt--;
+        mutex_exit(&dzp->z_lock);
+        return;
+    }
+    prev_dl = &dzp->z_dirlocks;
+    while ((cur_dl = *prev_dl) != dl)
+        prev_dl = &cur_dl->dl_next;
+    *prev_dl = dl->dl_next;
+    cv_broadcast(&dl->dl_cv);
+    mutex_exit(&dzp->z_lock);
+
+    if (dl->dl_namesize != 0)
+        kmem_free(dl->dl_name, dl->dl_namesize);
+    cv_destroy(&dl->dl_cv);
+    kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ *       no directory entries are actually stored for them. If this is
+ *       the root of a filesystem, then '.zfs' is also treated as a
+ *       special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
+    int *deflg, pathname_t *rpnp)
+{
+    zfs_dirlock_t *dl;
+    znode_t *zp;
+    int error = 0;
+
+    if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+        *vpp = ZTOV(dzp);
+        VN_HOLD(*vpp);
+    } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+        zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+        /*
+         * If we are a snapshot mounted under .zfs, return
+         * the vp for the snapshot directory.
+         */
+        if (dzp->z_phys->zp_parent == dzp->z_id &&
+            zfsvfs->z_parent != zfsvfs) {
+            error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+                "snapshot", vpp, NULL, 0, NULL, kcred,
+                NULL, NULL, NULL);
+            return (error);
+        }
+        rw_enter(&dzp->z_parent_lock, RW_READER);
+        error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+        if (error == 0)
+            *vpp = ZTOV(zp);
+        rw_exit(&dzp->z_parent_lock);
+    } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+        *vpp = zfsctl_root(dzp);
+    } else {
+        int zf;
+
+        zf = ZEXISTS | ZSHARED;
+        if (flags & FIGNORECASE)
+            zf |= ZCILOOK;
+
+        error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+        if (error == 0) {
+            *vpp = ZTOV(zp);
+            zfs_dirent_unlock(dl);
+            dzp->z_zn_prefetch = B_TRUE;    /* enable prefetching */
+        }
+        rpnp = NULL;
+    }
+
+    if ((flags & FIGNORECASE) && rpnp && !error)
+        (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
+
+    return (error);
+}
+
+static char *
+zfs_unlinked_hexname(char namebuf[17], uint64_t x)
+{
+    char *name = &namebuf[16];
+    const char digits[16] = "0123456789abcdef";
+
+    *name = '\0';
+    do {
+        *--name = digits[x & 0xf];
+        x >>= 4;
+    } while (x != 0);
+
+    return (name);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
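+ *
+ * For reference, entries in the unlinked set are keyed by the znode's
+ * object number rendered in lowercase hex by zfs_unlinked_hexname()
+ * above, e.g. (illustrative) object 44 (0x2c) is stored under the zap
+ * name "2c".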
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+    zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+    char obj_name[17];
+    int error;
+
+    ASSERT(zp->z_unlinked);
+    ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+    error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+        zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
+    ASSERT3U(error, ==, 0);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+    zap_cursor_t zc;
+    zap_attribute_t zap;
+    dmu_object_info_t doi;
+    znode_t *zp;
+    int error;
+
+    /*
+     * Iterate over the contents of the unlinked set.
+     */
+    for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+        zap_cursor_retrieve(&zc, &zap) == 0;
+        zap_cursor_advance(&zc)) {
+
+        /*
+         * See what kind of object we have in the list
+         */
+
+        error = dmu_object_info(zfsvfs->z_os,
+            zap.za_first_integer, &doi);
+        if (error != 0)
+            continue;
+
+        ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+            (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+        /*
+         * We need to re-mark these list entries for deletion,
+         * so we pull them back into core and set zp->z_unlinked.
+         */
+        error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+        /*
+         * We may pick up znodes that are already marked for deletion.
+         * This could happen during the purge of an extended attribute
+         * directory. All we need to do is skip over them, since they
+         * are already in the system marked z_unlinked.
+         */
+        if (error != 0)
+            continue;
+
+        zp->z_unlinked = B_TRUE;
+        VN_RELE(ZTOV(zp));
+    }
+    zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ *       so there is no need to lock its entries before deletion.
+ *       Also, it assumes the directory contents are *only* regular
+ *       files.
+ */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + VN_RELE(ZTOV(xzp)); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + VN_RELE(ZTOV(xzp)); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + char obj_name[17]; + dmu_tx_t *tx; + uint64_t acl_obj; + int error; + + ASSERT(ZTOV(zp)->v_count == 0); + ASSERT(zp->z_phys->zp_links == 0); + + /* + * If this is an attribute directory, purge its contents. + */ + if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it on the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + if (zp->z_phys->zp_xattr) { + error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + ASSERT(error == 0); + } + + acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + + /* + * Set up the transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + goto out; + } + + if (xzp) { + dmu_buf_will_dirty(xzp->z_dbuf, tx); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + xzp->z_phys->zp_links = 0; /* no more links to it */ + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + /* Remove this znode from the unlinked set */ + error = zap_remove(os, zfsvfs->z_unlinkedobj, + zfs_unlinked_hexname(obj_name, zp->z_id), tx); + ASSERT3U(error, ==, 0); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); +out: + if (xzp) + VN_RELE(ZTOV(xzp)); +} + +static uint64_t +zfs_dirent(znode_t *zp) +{ + uint64_t de = zp->z_id; + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT((zp)->z_phys->zp_mode) << 60; + return (de); +} + +/* + * Link zp into dl. Can only fail if zp has been unlinked. 
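+ * The ZAP value added for the new entry is the 64-bit word built by
+ * zfs_dirent() above: the object number in the low bits and, on
+ * ZPL_VERSION_DIRENT_TYPE or newer filesystems, the IFTODT() file
+ * type in the top four bits. An illustrative decode (not code from
+ * this file; cf. ZFS_DIRENT_OBJ() as used by zfs_purgedir()):
+ *
+ *	uint64_t obj  = de & ~(0xfULL << 60);	object number
+ *	int	 type = (int)(de >> 60);	DT_* type, 0 if not recorded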
+ */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + uint64_t value; + int zp_is_dir = (vp->v_type == VDIR); + int error; + + dmu_buf_will_dirty(zp->z_dbuf, tx); + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (ENOENT); + } + zp->z_phys->zp_links++; + } + zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ + + if (!(flag & ZNEW)) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + mutex_exit(&zp->z_lock); + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size++; /* one dirent added */ + dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */ + zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + mutex_exit(&dzp->z_lock); + + value = zfs_dirent(zp); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + 8, 1, &value, tx); + ASSERT(error == 0); + + dnlc_update(ZTOV(dzp), dl->dl_name, vp); + + return (0); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. + */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + boolean_t *unlinkedp) +{ + znode_t *dzp = dl->dl_dzp; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + boolean_t unlinked = B_FALSE; + int error; + + dnlc_remove(ZTOV(dzp), dl->dl_name); + + if (!(flag & ZRENAMING)) { + dmu_buf_will_dirty(zp->z_dbuf, tx); + + if (vn_vfswlock(vp)) /* prevent new mounts on zp */ + return (EBUSY); + + if (vn_ismntpt(vp)) { /* don't remove mount point */ + vn_vfsunlock(vp); + return (EBUSY); + } + + mutex_enter(&zp->z_lock); + if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + return (EEXIST); + } + if (zp->z_phys->zp_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %s is %u, " + "should be at least %u", + zp->z_vnode->v_path ? zp->z_vnode->v_path : + "", (int)zp->z_phys->zp_links, + zp_is_dir + 1); + zp->z_phys->zp_links = zp_is_dir + 1; + } + if (--zp->z_phys->zp_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_phys->zp_links = 0; + unlinked = B_TRUE; + } else { + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + } + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + } + + dmu_buf_will_dirty(dzp->z_dbuf, tx); + mutex_enter(&dzp->z_lock); + dzp->z_phys->zp_size--; /* one dirent removed */ + dzp->z_phys->zp_links -= zp_is_dir; /* ".." 
link from zp */
+	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+	mutex_exit(&dzp->z_lock);
+
+	if (zp->z_zfsvfs->z_norm) {
+		if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
+		    (flag & ZCIEXACT)) ||
+		    ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
+		    !(flag & ZCILOOK)))
+			error = zap_remove_norm(zp->z_zfsvfs->z_os,
+			    dzp->z_id, dl->dl_name, MT_EXACT, tx);
+		else
+			error = zap_remove_norm(zp->z_zfsvfs->z_os,
+			    dzp->z_id, dl->dl_name, MT_FIRST, tx);
+	} else {
+		error = zap_remove(zp->z_zfsvfs->z_os,
+		    dzp->z_id, dl->dl_name, tx);
+	}
+	ASSERT(error == 0);
+
+	if (unlinkedp != NULL)
+		*unlinkedp = unlinked;
+	else if (unlinked)
+		zfs_unlinked_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be considered a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+	return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	int error;
+	zfs_fuid_info_t *fuidp = NULL;
+
+	*xvpp = NULL;
+
+	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
+		return (error);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
+		if (zfsvfs->z_fuid_obj == 0) {
+			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+			    FUID_SIZE_ESTIMATE(zfsvfs));
+			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+		} else {
+			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+			    FUID_SIZE_ESTIMATE(zfsvfs));
+		}
+	}
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+			dmu_tx_wait(tx);
+		dmu_tx_abort(tx);
+		return (error);
+	}
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
+	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+	zp->z_phys->zp_xattr = xzp->z_id;
+
+	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+	    xzp, "", NULL, fuidp, vap);
+	if (fuidp)
+		zfs_fuid_info_free(fuidp);
+	dmu_tx_commit(tx);
+
+	*xvpp = ZTOV(xzp);
+
+	return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ *	IN:	zp	- znode to obtain attribute directory from
+ *		cr	- credentials of caller
+ *		flags	- flags from the VOP_LOOKUP call
+ *
+ *	OUT:	xzpp	- pointer to extended attribute znode
+ *
+ *	RETURN:	0 on success
+ *		error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *xzp;
+	zfs_dirlock_t *dl;
+	vattr_t va;
+	int error;
+top:
+	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+	if (error)
+		return (error);
+
+	if (xzp != NULL) {
+		*xvpp = ZTOV(xzp);
+		zfs_dirent_unlock(dl);
+		return (0);
+	}
+
+	ASSERT(zp->z_phys->zp_xattr == 0);
+
+	if (!(flags & CREATE_XATTR_DIR)) {
+		zfs_dirent_unlock(dl);
+		return (ENOENT);
+	}
+
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		zfs_dirent_unlock(dl);
+		return (EROFS);
+	}
+
+	/*
+	 * The ability to 'create' files in an attribute
+	 * directory comes from the write_xattr permission on the base file.
+ * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. + */ + va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xvpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + + if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + return (0); + + if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (ZTOV(zp)->v_type == VREG && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(cr)); +} diff --git a/zfs/lib/libdmu-ctl/zfs_fuid.c b/zfs/lib/libdmu-ctl/zfs_fuid.c new file mode 100644 index 0000000000..59c9adfe27 --- /dev/null +++ b/zfs/lib/libdmu-ctl/zfs_fuid.c @@ -0,0 +1,688 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "@(#)zfs_fuid.c 1.5 08/01/31 SMI" + +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#endif +#include + +/* + * FUID Domain table(s). + * + * The FUID table is stored as a packed nvlist of an array + * of nvlists which contain an index, domain string and offset + * + * During file system initialization the nvlist(s) are read and + * two AVL trees are created. One tree is keyed by the index number + * and the other by the domain string. Nodes are never removed from + * trees, but new entries may be added. If a new entry is added then the + * on-disk packed nvlist will also be updated. 
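+ *
+ * A hedged sketch of the packed nvlist layout (the keys match the
+ * FUID_* defines below; the domain strings are made-up Windows SIDs):
+ *
+ *	fuid_nvlist = [
+ *	    { fuid_idx = 1, fuid_offset = 0, fuid_domain = "S-1-5-21-..." },
+ *	    { fuid_idx = 2, fuid_offset = 0, fuid_domain = "S-1-5-32-..." },
+ *	    ...
+ *	]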
+ */ + +#define FUID_IDX "fuid_idx" +#define FUID_DOMAIN "fuid_domain" +#define FUID_OFFSET "fuid_offset" +#define FUID_NVP_ARRAY "fuid_nvlist" + +typedef struct fuid_domain { + avl_node_t f_domnode; + avl_node_t f_idxnode; + ksiddomain_t *f_ksid; + uint64_t f_idx; +} fuid_domain_t; + +/* + * Compare two indexes. + */ +static int +idx_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + + if (node1->f_idx < node2->f_idx) + return (-1); + else if (node1->f_idx > node2->f_idx) + return (1); + return (0); +} + +/* + * Compare two domain strings. + */ +static int +domain_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = arg1; + const fuid_domain_t *node2 = arg2; + int val; + + val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); + if (val == 0) + return (0); + return (val > 0 ? 1 : -1); +} + +/* + * load initial fuid domain and idx trees. This function is used by + * both the kernel and zdb. + */ +uint64_t +zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, + avl_tree_t *domain_tree) +{ + dmu_buf_t *db; + uint64_t fuid_size; + + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); + + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + fuid_size = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + if (fuid_size) { + nvlist_t **fuidnvp; + nvlist_t *nvp = NULL; + uint_t count; + char *packed; + int i; + + packed = kmem_alloc(fuid_size, KM_SLEEP); + VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(nvlist_unpack(packed, fuid_size, + &nvp, 0) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, + &fuidnvp, &count) == 0); + + for (i = 0; i != count; i++) { + fuid_domain_t *domnode; + char *domain; + uint64_t idx; + + VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, + &domain) == 0); + VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, + &idx) == 0); + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + + domnode->f_idx = idx; + domnode->f_ksid = ksid_lookupdomain(domain); + avl_add(idx_tree, domnode); + avl_add(domain_tree, domnode); + } + nvlist_free(nvp); + kmem_free(packed, fuid_size); + } + return (fuid_size); +} + +void +zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + fuid_domain_t *domnode; + void *cookie; + + cookie = NULL; + while (domnode = avl_destroy_nodes(domain_tree, &cookie)) + ksiddomain_rele(domnode->f_ksid); + + avl_destroy(domain_tree); + cookie = NULL; + while (domnode = avl_destroy_nodes(idx_tree, &cookie)) + kmem_free(domnode, sizeof (fuid_domain_t)); + avl_destroy(idx_tree); +} + +char * +zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + + searchnode.f_idx = idx; + + findnode = avl_find(idx_tree, &searchnode, &loc); + + return (findnode->f_ksid->kd_name); +} + +#ifdef _KERNEL +/* + * Load the fuid table(s) into memory. 
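+ * Loading is lazy and guarded by z_fuid_lock: z_fuid_loaded is
+ * re-checked under the write lock, and if no on-disk table exists yet
+ * one is allocated here -- but only when a dmu_tx_t is supplied, so
+ * read-only callers (tx == NULL) simply end up with an empty table.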
+ */ +static void +zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + int error = 0; + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + if (zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + + if (zfsvfs->z_fuid_obj == 0) { + + /* first make sure we need to allocate object */ + + error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); + if (error == ENOENT && tx != NULL) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + } + + zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, + zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + + zfsvfs->z_fuid_loaded = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Query domain table for a given domain. + * + * If domain isn't found it is added to AVL trees and + * the results are pushed out to disk. + */ +int +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, + dmu_tx_t *tx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + + /* + * If the dummy "nobody" domain then return an index of 0 + * to cause the created FUID to be a standard POSIX id + * for the user nobody. + */ + if (domain[0] == '\0') { + *retdomain = ""; + return (0); + } + + searchnode.f_ksid = ksid_lookupdomain(domain); + if (retdomain) { + *retdomain = searchnode.f_ksid->kd_name; + } + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, tx); + + rw_enter(&zfsvfs->z_fuid_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); + rw_exit(&zfsvfs->z_fuid_lock); + + if (findnode) { + ksiddomain_rele(searchnode.f_ksid); + return (findnode->f_idx); + } else { + fuid_domain_t *domnode; + nvlist_t *nvp; + nvlist_t **fuids; + uint64_t retidx; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + int i = 0; + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + domnode->f_ksid = searchnode.f_ksid; + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; + + avl_add(&zfsvfs->z_fuid_domain, domnode); + avl_add(&zfsvfs->z_fuid_idx, domnode); + /* + * Now resync the on-disk nvlist. 
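+	 * (The table is rewritten wholesale: every domain node is
+	 * re-packed into a single nvlist and rewritten via dmu_write()
+	 * from offset zero. Domain tables stay small -- one entry per
+	 * domain ever seen -- so this beats an incremental update.)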
+ */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + domnode = avl_first(&zfsvfs->z_fuid_domain); + fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); + while (domnode) { + VERIFY(nvlist_alloc(&fuids[i], + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], + FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, retidx) == 0); + for (i = 0; i != retidx; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, retidx * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + rw_exit(&zfsvfs->z_fuid_lock); + return (retidx); + } +} + +/* + * Query domain table by index, returning domain string + * + * Returns a pointer from an avl node of the domain string. + * + */ +static char * +zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) +{ + char *domain; + + if (idx == 0 || !zfsvfs->z_use_fuids) + return (NULL); + + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs, NULL); + + rw_enter(&zfsvfs->z_fuid_lock, RW_READER); + domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); + rw_exit(&zfsvfs->z_fuid_lock); + + ASSERT(domain); + return (domain); +} + +void +zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) +{ + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, + cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, + cr, ZFS_GROUP); +} + +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + char *domain; + uid_t id; + + if (index == 0) + return (fuid); + + domain = zfs_fuid_find_by_idx(zfsvfs, index); + ASSERT(domain != NULL); + + if (type == ZFS_OWNER || type == ZFS_ACE_USER) { + (void) kidmap_getuidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } else { + (void) kidmap_getgidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } + return (id); +} + +/* + * Add a FUID node to the list of fuid's being created for this + * ACL + * + * If ACL has multiple domains, then keep only one copy of each unique + * domain. + */ +static void +zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, + uint64_t idx, uint64_t id, zfs_fuid_type_t type) +{ + zfs_fuid_t *fuid; + zfs_fuid_domain_t *fuid_domain; + zfs_fuid_info_t *fuidp; + uint64_t fuididx; + boolean_t found = B_FALSE; + + if (*fuidpp == NULL) + *fuidpp = zfs_fuid_info_alloc(); + + fuidp = *fuidpp; + /* + * First find fuid domain index in linked list + * + * If one isn't found then create an entry. 
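+	 * Note that fuididx is the 1-based position of the domain in
+	 * this per-ACL list: it is the index FUID_ENCODE() folds into
+	 * z_logfuid below, letting the ZIL record name a domain by its
+	 * slot in the zfs_fuid_info_t rather than its on-disk index
+	 * (the replay path indexes z_domain_table[idx - 1] accordingly).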
+ */ + + for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); + fuid_domain; fuid_domain = list_next(&fuidp->z_domains, + fuid_domain), fuididx++) { + if (idx == fuid_domain->z_domidx) { + found = B_TRUE; + break; + } + } + + if (!found) { + fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); + fuid_domain->z_domain = domain; + fuid_domain->z_domidx = idx; + list_insert_tail(&fuidp->z_domains, fuid_domain); + fuidp->z_domain_str_sz += strlen(domain) + 1; + fuidp->z_domain_cnt++; + } + + if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* + * Now allocate fuid entry and add it on the end of the list + */ + + fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + fuid->z_id = id; + fuid->z_domidx = idx; + fuid->z_logfuid = FUID_ENCODE(fuididx, rid); + + list_insert_tail(&fuidp->z_fuids, fuid); + fuidp->z_fuid_cnt++; + } else { + if (type == ZFS_OWNER) + fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); + else + fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); + } +} + +/* + * Create a file system FUID, based on information in the users cred + */ +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uint64_t idx; + ksid_t *ksid; + uint32_t rid; + char *kdomain; + const char *domain; + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + if (type == ZFS_OWNER) + id = crgetuid(cr); + else + id = crgetgid(cr); + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id)) + return ((uint64_t)id); + + ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); + + VERIFY(ksid != NULL); + rid = ksid_getrid(ksid); + domain = ksid_getdomain(ksid); + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); + + return (FUID_ENCODE(idx, rid)); +} + +/* + * Create a file system FUID for an ACL ace + * or a chown/chgrp of the file. + * This is similar to zfs_fuid_create_cred, except that + * we can't find the domain + rid information in the + * cred. Instead we have to query Winchester for the + * domain and rid. + * + * During replay operations the domain+rid information is + * found in the zfs_fuid_info_t that the replay code has + * attached to the zfsvfs of the file system. + */ +uint64_t +zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, + zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) +{ + const char *domain; + char *kdomain; + uint32_t fuid_idx = FUID_INDEX(id); + uint32_t rid; + idmap_stat status; + uint64_t idx; + boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); + zfs_fuid_t *zfuid = NULL; + zfs_fuid_info_t *fuidp; + + /* + * If POSIX ID, or entry is already a FUID then + * just return the id + * + * We may also be handed an already FUID'ized id via + * chmod. + */ + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) + return (id); + + if (is_replay) { + fuidp = zfsvfs->z_fuid_replay; + + /* + * If we are passed an ephemeral id, but no + * fuid_info was logged then return NOBODY. + * This is most likely a result of idmap service + * not being available. 
+ */ + if (fuidp == NULL) + return (UID_NOBODY); + + switch (type) { + case ZFS_ACE_USER: + case ZFS_ACE_GROUP: + zfuid = list_head(&fuidp->z_fuids); + rid = FUID_RID(zfuid->z_logfuid); + idx = FUID_INDEX(zfuid->z_logfuid); + break; + case ZFS_OWNER: + rid = FUID_RID(fuidp->z_fuid_owner); + idx = FUID_INDEX(fuidp->z_fuid_owner); + break; + case ZFS_GROUP: + rid = FUID_RID(fuidp->z_fuid_group); + idx = FUID_INDEX(fuidp->z_fuid_group); + break; + }; + domain = fuidp->z_domain_table[idx -1]; + } else { + if (type == ZFS_OWNER || type == ZFS_ACE_USER) + status = kidmap_getsidbyuid(crgetzone(cr), id, + &domain, &rid); + else + status = kidmap_getsidbygid(crgetzone(cr), id, + &domain, &rid); + + if (status != 0) { + /* + * When returning nobody we will need to + * make a dummy fuid table entry for logging + * purposes. + */ + rid = UID_NOBODY; + domain = ""; + } + } + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + + if (!is_replay) + zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + else if (zfuid != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + return (FUID_ENCODE(idx, rid)); +} + +void +zfs_fuid_destroy(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + if (!zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Allocate zfs_fuid_info for tracking FUIDs created during + * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() + */ +zfs_fuid_info_t * +zfs_fuid_info_alloc(void) +{ + zfs_fuid_info_t *fuidp; + + fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); + list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), + offsetof(zfs_fuid_domain_t, z_next)); + list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), + offsetof(zfs_fuid_t, z_next)); + return (fuidp); +} + +/* + * Release all memory associated with zfs_fuid_info_t + */ +void +zfs_fuid_info_free(zfs_fuid_info_t *fuidp) +{ + zfs_fuid_t *zfuid; + zfs_fuid_domain_t *zdomain; + + while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + + if (fuidp->z_domain_table != NULL) + kmem_free(fuidp->z_domain_table, + (sizeof (char **)) * fuidp->z_domain_cnt); + + while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { + list_remove(&fuidp->z_domains, zdomain); + kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); + } + + kmem_free(fuidp, sizeof (zfs_fuid_info_t)); +} + +/* + * Check to see if id is a groupmember. If cred + * has ksid info then sidlist is checked first + * and if still not found then POSIX groups are checked + * + * Will use a straight FUID compare when possible. 
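+ *
+ * An illustrative walk of the two cases handled below (ids are made
+ * up): for id = FUID_ENCODE(0, 1234) the index is 0, so 1234 is
+ * compared numerically against each ksid in the cred's sidlist; for a
+ * nonzero index the domain string is fetched with
+ * zfs_fuid_find_by_idx() and the (domain, rid) pair is compared
+ * instead.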
+ */ +boolean_t +zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) +{ + ksid_t *ksid = crgetsid(cr, KSID_GROUP); + uid_t gid; + + if (ksid) { + int i; + ksid_t *ksid_groups; + ksidlist_t *ksidlist = crgetsidlist(cr); + uint32_t idx = FUID_INDEX(id); + uint32_t rid = FUID_RID(id); + + ASSERT(ksidlist); + ksid_groups = ksidlist->ksl_sids; + + for (i = 0; i != ksidlist->ksl_nsid; i++) { + if (idx == 0) { + if (id != IDMAP_WK_CREATOR_GROUP_GID && + id == ksid_groups[i].ks_id) { + return (B_TRUE); + } + } else { + char *domain; + + domain = zfs_fuid_find_by_idx(zfsvfs, idx); + ASSERT(domain != NULL); + + if (strcmp(domain, + IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) + return (B_FALSE); + + if ((strcmp(domain, + ksid_groups[i].ks_domain->kd_name) == 0) && + rid == ksid_groups[i].ks_rid) + return (B_TRUE); + } + } + } + + /* + * Not found in ksidlist, check posix groups + */ + gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); + return (groupmember(gid, cr)); +} +#endif diff --git a/zfs/lib/libdmu-ctl/zfs_ioctl.c b/zfs/lib/libdmu-ctl/zfs_ioctl.c new file mode 100644 index 0000000000..e4d2534746 --- /dev/null +++ b/zfs/lib/libdmu-ctl/zfs_ioctl.c @@ -0,0 +1,3055 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "@(#)zfs_ioctl.c 1.61 08/04/27 SMI" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_deleg.h" + +extern struct modlfs zfs_modlfs; + +extern void zfs_init(void); +extern void zfs_fini(void); + +ldi_ident_t zfs_li = NULL; +dev_info_t *zfs_dip; + +typedef int zfs_ioc_func_t(zfs_cmd_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); + +typedef struct zfs_ioc_vec { + zfs_ioc_func_t *zvec_func; + zfs_secpolicy_func_t *zvec_secpolicy; + enum { + NO_NAME, + POOL_NAME, + DATASET_NAME + } zvec_namecheck; + boolean_t zvec_his_log; +} zfs_ioc_vec_t; + +/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ +void +__dprintf(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char buf[256]; + va_list adx; + + /* + * Get rid of annoying "../common/" prefix to filename. 
+ */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + /* + * To get this data, use the zfs-dprintf probe as so: + * dtrace -q -n 'zfs-dprintf \ + * /stringof(arg0) == "dbuf.c"/ \ + * {printf("%s: %s", stringof(arg1), stringof(arg3))}' + * arg0 = file name + * arg1 = function name + * arg2 = line number + * arg3 = message + */ + DTRACE_PROBE4(zfs__dprintf, + char *, newfile, char *, func, int, line, char *, buf); +} + +static void +history_str_free(char *buf) +{ + kmem_free(buf, HIS_MAX_RECORD_LEN); +} + +static char * +history_str_get(zfs_cmd_t *zc) +{ + char *buf; + + if (zc->zc_history == NULL) + return (NULL); + + buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + if (copyinstr((void *)(uintptr_t)zc->zc_history, + buf, HIS_MAX_RECORD_LEN, NULL) != 0) { + history_str_free(buf); + return (NULL); + } + + buf[HIS_MAX_RECORD_LEN -1] = '\0'; + + return (buf); +} + +/* + * zfs_check_version + * + * Return non-zero if the spa version is less than requested version. + */ +static int +zfs_check_version(const char *name, int version) +{ + + spa_t *spa; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa_version(spa) < version) { + spa_close(spa, FTAG); + return (1); + } + spa_close(spa, FTAG); + } + return (0); +} + +/* + * zpl_check_version + * + * Return non-zero if the ZPL version is less than requested version. + */ +static int +zpl_check_version(const char *name, int version) +{ + objset_t *os; + int rc = 1; + + if (dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) { + uint64_t propversion; + + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, + &propversion) == 0) { + rc = !(propversion >= version); + } + dmu_objset_close(os); + } + return (rc); +} + +static void +zfs_log_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *buf; + + if ((buf = history_str_get(zc)) == NULL) + return; + + if (spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) + (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); + spa_close(spa, FTAG); + } + history_str_free(buf); +} + +/* + * Policy for top-level read operations (list pools). Requires no privileges, + * and can be used in the local zone, as there is no associated dataset. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) +{ + return (0); +} + +/* + * Policy for dataset read operations (list children, get statistics). Requires + * no privileges, but must be visible in the local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) +{ + if (INGLOBALZONE(curproc) || + zone_dataset_visible(zc->zc_name, NULL)) + return (0); + + return (ENOENT); +} + +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + int writable = 1; + + /* + * The dataset must be visible by this zone -- check this first + * so they don't see EPERM on something they shouldn't know about. + */ + if (!INGLOBALZONE(curproc) && + !zone_dataset_visible(dataset, &writable)) + return (ENOENT); + + if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) + return (ENOENT); + + if (INGLOBALZONE(curproc)) { + /* + * If the fs is zoned, only root can access it from the + * global zone. + */ + if (secpolicy_zfs(cr) && zoned) + return (EPERM); + } else { + /* + * If we are in a local zone, the 'zoned' property must be set. 
+ */ + if (!zoned) + return (EPERM); + + /* must be writable by this zone */ + if (!writable) + return (EPERM); + } + return (0); +} + +int +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(name, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error) + error = dsl_deleg_access(name, perm, cr); + } + return (error); +} + +static int +zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) +{ + /* + * Check permissions for special properties. + */ + switch (prop) { + case ZFS_PROP_ZONED: + /* + * Disallow setting of 'zoned' from within a local zone. + */ + if (!INGLOBALZONE(curproc)) + return (EPERM); + break; + + case ZFS_PROP_QUOTA: + if (!INGLOBALZONE(curproc)) { + uint64_t zoned; + char setpoint[MAXNAMELEN]; + /* + * Unprivileged users are allowed to modify the + * quota on things *under* (ie. contained by) + * the thing they own. + */ + if (dsl_prop_get_integer(name, "zoned", &zoned, + setpoint)) + return (EPERM); + if (!zoned || strlen(name) <= strlen(setpoint)) + return (EPERM); + } + break; + } + + return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); +} + +int +zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(zc->zc_name, cr); + if (error) + return (error); + + /* + * permission to set permissions will be evaluated later in + * dsl_deleg_can_allow() + */ + return (0); +} + +int +zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr); + if (error == 0) + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + return (error); +} + +int +zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr)); +} + +int +zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curproc)) + return (EPERM); + + if (secpolicy_nfs(cr) == 0) { + return (0); + } else { + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); + } +} + +static int +zfs_get_parent(const char *datasetname, char *parent, int parentsize) +{ + char *cp; + + /* + * Remove the @bla or /bla from the end of the name to get the parent. 
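+ * For example (dataset names are illustrative):
+ *
+ *	"tank/home@snap"	->	"tank/home"
+ *	"tank/home/user"	->	"tank/home"
+ *	"tank"			->	ENOENT (nothing to strip)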
+ */ + (void) strncpy(parent, datasetname, parentsize); + cp = strrchr(parent, '@'); + if (cp != NULL) { + cp[0] = '\0'; + } else { + cp = strrchr(parent, '/'); + if (cp == NULL) + return (ENOENT); + cp[0] = '\0'; + } + + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); +} + +static int +zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); +} + +/* + * Must have sys_config privilege to check the iscsi permission + */ +/* ARGSUSED */ +static int +zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) +{ + return (secpolicy_zfs(cr)); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_RENAME, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + if ((error = zfs_get_parent(to, parentname, + sizeof (parentname))) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (error); +} + +static int +zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); +} + +static int +zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) +{ + char parentname[MAXNAMELEN]; + objset_t *clone; + int error; + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_PROMOTE, cr); + if (error) + return (error); + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &clone); + + if (error == 0) { + dsl_dataset_t *pclone = NULL; + dsl_dir_t *dd; + dd = clone->os->os_dsl_dataset->ds_dir; + + rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + error = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_origin_obj, NULL, + DS_MODE_NONE, FTAG, &pclone); + rw_exit(&dd->dd_pool->dp_config_rwlock); + if (error) { + dmu_objset_close(clone); + return (error); + } + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr); + + dsl_dataset_name(pclone, parentname); + dmu_objset_close(clone); + dsl_dataset_close(pclone, DS_MODE_NONE, FTAG); + if (error == 0) + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_PROMOTE, cr); + } + return (error); +} + +static int +zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RECEIVE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_CREATE, cr)); +} + +int +zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) +{ + + return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); +} + +static int +zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) +{ + char 
parentname[MAXNAMELEN]; + int error; + + if ((error = zfs_get_parent(zc->zc_name, parentname, + sizeof (parentname))) != 0) + return (error); + + if (zc->zc_value[0] != '\0') { + if ((error = zfs_secpolicy_write_perms(zc->zc_value, + ZFS_DELEG_PERM_CLONE, cr)) != 0) + return (error); + } + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr); + + return (error); +} + +static int +zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + + error = secpolicy_fs_unmount(cr, NULL); + if (error) { + error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); + } + return (error); +} + +/* + * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires + * SYS_CONFIG privilege, which is not available in a local zone. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) + return (EPERM); + + return (0); +} + +/* + * Just like zfs_secpolicy_config, except that we will check for + * mount permission on the dataset for permission to create/remove + * the minor nodes. + */ +static int +zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) { + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)); + } + + return (0); +} + +/* + * Policy for fault injection. Requires all privileges. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) +{ + return (secpolicy_zinject(cr)); +} + +static int +zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) +{ + zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); + + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(zc->zc_value)) + return (EINVAL); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_USERPROP, cr)); + } else { + if (!zfs_prop_inheritable(prop)) + return (EINVAL); + return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); + } +} + +/* + * Returns the nvlist as specified by the user in the zfs_cmd_t. + */ +static int +get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) +{ + char *packed; + int error; + nvlist_t *list = NULL; + + /* + * Read in and unpack the user-supplied nvlist. 
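+ * A hedged sketch of the matching userland side -- libzfs does the
+ * moral equivalent of this; the snippet is illustrative, not its
+ * actual code:
+ *
+ *	char *packed = NULL;
+ *	size_t size = 0;
+ *	nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0);
+ *	zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
+ *	zc.zc_nvlist_src_size = size;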
+ */ + if (size == 0) + return (EINVAL); + + packed = kmem_alloc(size, KM_SLEEP); + + if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { + kmem_free(packed, size); + return (error); + } + + if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { + kmem_free(packed, size); + return (error); + } + + kmem_free(packed, size); + + *nvp = list; + return (0); +} + +static int +put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + size_t size; + int error; + + VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); + + if (size > zc->zc_nvlist_dst_size) { + error = ENOMEM; + } else { + packed = kmem_alloc(size, KM_SLEEP); + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); + error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, + size); + kmem_free(packed, size); + } + + zc->zc_nvlist_dst_size = size; + return (error); +} + +static int +zfs_ioc_pool_create(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config, *props = NULL; + char *buf; + + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); + return (error); + } + + buf = history_str_get(zc); + + error = spa_create(zc->zc_name, config, props, buf); + + if (buf != NULL) + history_str_free(buf); + + nvlist_free(config); + + if (props) + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_pool_destroy(zfs_cmd_t *zc) +{ + int error; + zfs_log_history(zc); + error = spa_destroy(zc->zc_name); + return (error); +} + +static int +zfs_ioc_pool_import(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config, *props = NULL; + uint64_t guid; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) != 0) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + nvlist_free(config); + return (error); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != zc->zc_guid) + error = EINVAL; + else + error = spa_import(zc->zc_name, config, props); + + nvlist_free(config); + + if (props) + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_pool_export(zfs_cmd_t *zc) +{ + int error; + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL); + return (error); +} + +static int +zfs_ioc_pool_configs(zfs_cmd_t *zc) +{ + nvlist_t *configs; + int error; + + if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) + return (EEXIST); + + error = put_nvlist(zc, configs); + + nvlist_free(configs); + + return (error); +} + +static int +zfs_ioc_pool_stats(zfs_cmd_t *zc) +{ + nvlist_t *config; + int error; + int ret = 0; + + error = spa_get_stats(zc->zc_name, &config, zc->zc_value, + sizeof (zc->zc_value)); + + if (config != NULL) { + ret = put_nvlist(zc, config); + nvlist_free(config); + + /* + * The config may be present even if 'error' is non-zero. + * In this case we return success, and preserve the real errno + * in 'zc_cookie'. + */ + zc->zc_cookie = error; + } else { + ret = error; + } + + return (ret); +} + +/* + * Try to import the given pool, returning pool stats as appropriate so that + * user land knows which devices are available and overall pool health. 
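+ * Nothing is committed here: spa_tryimport() only probes the pool
+ * described by the user-supplied config, and a NULL result is mapped
+ * to EINVAL below.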
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+	nvlist_t *tryconfig, *config;
+	int error;
+
+	if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+	    &tryconfig)) != 0)
+		return (error);
+
+	config = spa_tryimport(tryconfig);
+
+	nvlist_free(tryconfig);
+
+	if (config == NULL)
+		return (EINVAL);
+
+	error = put_nvlist(zc, config);
+	nvlist_free(config);
+
+	return (error);
+}
+
+static int
+zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+		return (error);
+
+	mutex_enter(&spa_namespace_lock);
+	error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
+	mutex_exit(&spa_namespace_lock);
+
+	spa_close(spa, FTAG);
+
+	return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error == 0) {
+		spa_freeze(spa);
+		spa_close(spa, FTAG);
+	}
+	return (error);
+}
+
+static int
+zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+		return (error);
+
+	if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) {
+		spa_close(spa, FTAG);
+		return (EINVAL);
+	}
+
+	spa_upgrade(spa, zc->zc_cookie);
+	spa_close(spa, FTAG);
+
+	return (error);
+}
+
+static int
+zfs_ioc_pool_get_history(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	char *hist_buf;
+	uint64_t size;
+	int error;
+
+	if ((size = zc->zc_history_len) == 0)
+		return (EINVAL);
+
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+		return (error);
+
+	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+		spa_close(spa, FTAG);
+		return (ENOTSUP);
+	}
+
+	hist_buf = kmem_alloc(size, KM_SLEEP);
+	if ((error = spa_history_get(spa, &zc->zc_history_offset,
+	    &zc->zc_history_len, hist_buf)) == 0) {
+		error = xcopyout(hist_buf,
+		    (char *)(uintptr_t)zc->zc_history,
+		    zc->zc_history_len);
+	}
+
+	spa_close(spa, FTAG);
+	kmem_free(hist_buf, size);
+	return (error);
+}
+
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+	int error;
+
+	if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value))
+		return (error);
+
+	return (0);
+}
+
+static int
+zfs_ioc_obj_to_path(zfs_cmd_t *zc)
+{
+	objset_t *osp;
+	int error;
+
+	if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
+	    DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0)
+		return (error);
+
+	error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
+	    sizeof (zc->zc_value));
+	dmu_objset_close(osp);
+
+	return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+	nvlist_t *config, **l2cache, **spares;
+	uint_t nl2cache = 0, nspares = 0;
+
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+	    &config);
+	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
+	    &l2cache, &nl2cache);
+
+	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares);
+
+	/*
+	 * A root pool with concatenated devices is not supported,
+	 * so a plain device cannot be added to a root pool.
+	 *
+	 * An intent log device cannot be added to a root pool either,
+	 * because the ZIL is replayed during mountroot, and a separate
+	 * log device cannot be accessed at that time.
+	 *
+	 * l2cache and spare devices are OK to add to a root pool.
+ */ + if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { + spa_close(spa, FTAG); + return (EDOM); + } + + if (error == 0) { + error = spa_vdev_add(spa, config); + nvlist_free(config); + } + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_remove(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; + + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; + + case VDEV_STATE_FAULTED: + error = vdev_fault(spa, zc->zc_guid); + break; + + case VDEV_STATE_DEGRADED: + error = vdev_degrade(spa, zc->zc_guid); + break; + + default: + error = EINVAL; + } + zc->zc_cookie = newstate; + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_attach(zfs_cmd_t *zc) +{ + spa_t *spa; + int replacing = zc->zc_cookie; + nvlist_t *config; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config)) == 0) { + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_detach(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setpath(spa, guid, path); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_os_open_retry(char *name, objset_t **os) +{ + int error; + +retry: + error = dmu_objset_open(name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, os); + if (error != 0) { + /* + * This is ugly: dmu_objset_open() can return EBUSY if + * the objset is held exclusively. Fortunately this hold is + * only for a short while, so we retry here. + * This avoids user code having to handle EBUSY, + * for example for a "zfs list". + */ + if (error == EBUSY) { + delay(1); + goto retry; + } + } + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + * zc_value alternate root + */ +static int +zfs_ioc_objset_stats(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + nvlist_t *nv; + + if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) + return (error); + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_all(os, &nv)) == 0) { + dmu_objset_stats(os, nv); + /* + * NB: zvol_get_stats() will read the objset contents, + * which we aren't supposed to do with a + * DS_MODE_STANDARD open, because it could be + * inconsistent. So this is a bit of a workaround... 
+ */ + if (!zc->zc_objset_stats.dds_inconsistent) { + if (dmu_objset_type(os) == DMU_OST_ZVOL) + VERIFY(zvol_get_stats(os, nv) == 0); + } + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value)); + + dmu_objset_close(os); + return (error); +} + +static int +nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) +{ + uint64_t value; + int error; + + /* + * zfs_get_zplprop() will either find a value or give us + * the default value (if there is one). + */ + if ((error = zfs_get_zplprop(os, prop, &value)) != 0) + return (error); + VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for zpl property nvlist + * + * outputs: + * zc_nvlist_dst zpl property nvlist + * zc_nvlist_dst_size size of zpl property nvlist + */ +static int +zfs_ioc_objset_zplprops(zfs_cmd_t *zc) +{ + objset_t *os; + int err; + + if ((err = zfs_os_open_retry(zc->zc_name, &os)) != 0) + return (err); + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + /* + * NB: nvl_add_zplprop() will read the objset contents, + * which we aren't supposed to do with a DS_MODE_STANDARD + * open, because it could be inconsistent. + */ + if (zc->zc_nvlist_dst != NULL && + !zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZFS) { + nvlist_t *nv; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) + err = put_nvlist(zc, nv); + nvlist_free(nv); + } else { + err = ENOENT; + } + dmu_objset_close(os); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next filesystem + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + * zc_value alternate root + */ +static int +zfs_ioc_dataset_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + char *p; + + if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) { + if (error == ENOENT) + error = ESRCH; + return (error); + } + + p = strrchr(zc->zc_name, '/'); + if (p == NULL || p[1] != '\0') + (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); + p = zc->zc_name + strlen(zc->zc_name); + + do { + error = dmu_dir_list_next(os, + sizeof (zc->zc_name) - (p - zc->zc_name), p, + NULL, &zc->zc_cookie); + if (error == ENOENT) + error = ESRCH; + } while (error == 0 && !INGLOBALZONE(curproc) && + !zone_dataset_visible(zc->zc_name, NULL)); + + /* + * If it's a hidden dataset (ie. with a '$' in its name), don't + * try to get stats for it. Userland will skip over it. 
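+ * (The '$' convention marks internal datasets -- for example the
+ * clone-origin dataset, named "$ORIGIN" in contemporary sources;
+ * treat the exact names as an implementation detail.)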
+ */ + if (error == 0 && strchr(zc->zc_name, '$') == NULL) + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + + dmu_objset_close(os); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next snapshot + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + * zc_value alternate root + */ +static int +zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + if ((error = zfs_os_open_retry(zc->zc_name, &os)) != 0) { + if (error == ENOENT) + error = ESRCH; + return (error); + } + + /* + * A dataset name of maximum length cannot have any snapshots, + * so exit immediately. + */ + if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { + dmu_objset_close(os); + return (ESRCH); + } + + error = dmu_snapshot_list_next(os, + sizeof (zc->zc_name) - strlen(zc->zc_name), + zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); + if (error == ENOENT) + error = ESRCH; + + if (error == 0) + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + + /* if we failed, undo the @ that we tacked on to zc_name */ + if (error != 0) + *strchr(zc->zc_name, '@') = '\0'; + + dmu_objset_close(os); + return (error); +} + +int +zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) +{ + nvpair_t *elem; + int error; + uint64_t intval; + char *strval; + + /* + * First validate permission to set all of the properties + */ + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + + if (prop == ZPROP_INVAL) { + /* + * If this is a user-defined property, it must be a + * string, and there is no further validation to do. + */ + if (!zfs_prop_user(propname) || + nvpair_type(elem) != DATA_TYPE_STRING) + return (EINVAL); + + if (error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + continue; + } + + if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) + return (error); + + /* + * Check that this value is valid for this pool version + */ + switch (prop) { + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. 
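+		 * An illustrative failure: on a pool older than
+		 * SPA_VERSION_GZIP_COMPRESSION, "zfs set
+		 * compression=gzip-9" arrives here with intval in the
+		 * GZIP range and gets ENOTSUP instead of setting the
+		 * property.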
+ */ + if (nvpair_type(elem) == DATA_TYPE_UINT64 && + nvpair_value_uint64(elem, &intval) == 0 && + intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9) { + if (zfs_check_version(name, + SPA_VERSION_GZIP_COMPRESSION)) + return (ENOTSUP); + } + break; + + case ZFS_PROP_COPIES: + if (zfs_check_version(name, SPA_VERSION_DITTO_BLOCKS)) + return (ENOTSUP); + break; + + case ZFS_PROP_SHARESMB: + if (zpl_check_version(name, ZPL_VERSION_FUID)) + return (ENOTSUP); + break; + } + if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) + return (error); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + + if (prop == ZPROP_INVAL) { + VERIFY(nvpair_value_string(elem, &strval) == 0); + error = dsl_prop_set(name, propname, 1, + strlen(strval) + 1, strval); + if (error == 0) + continue; + else + return (error); + } + + switch (prop) { + case ZFS_PROP_QUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dir_set_quota(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_REFQUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_quota(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_RESERVATION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dir_set_reservation(name, + intval)) != 0) + return (error); + break; + + case ZFS_PROP_REFRESERVATION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_reservation(name, + intval)) != 0) + return (error); + break; + + case ZFS_PROP_VOLSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volsize(name, + ddi_driver_major(zfs_dip), intval)) != 0) + return (error); + break; + + case ZFS_PROP_VOLBLOCKSIZE: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zvol_set_volblocksize(name, intval)) != 0) + return (error); + break; + + case ZFS_PROP_VERSION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = zfs_set_version(name, intval)) != 0) + return (error); + break; + + default: + if (nvpair_type(elem) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != + PROP_TYPE_STRING) + return (EINVAL); + VERIFY(nvpair_value_string(elem, &strval) == 0); + if ((error = dsl_prop_set(name, + nvpair_name(elem), 1, strlen(strval) + 1, + strval)) != 0) + return (error); + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + const char *unused; + + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + break; + case PROP_TYPE_STRING: + return (EINVAL); + case PROP_TYPE_INDEX: + if (zfs_prop_index_to_string(prop, + intval, &unused) != 0) + return (EINVAL); + break; + default: + cmn_err(CE_PANIC, + "unknown property type"); + break; + } + + if ((error = dsl_prop_set(name, propname, + 8, 1, &intval)) != 0) + return (error); + } else { + return (EINVAL); + } + break; + } + } + + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * zc_nvlist_src{_size} nvlist of properties to apply + * + * outputs: none + */ +static int +zfs_ioc_set_prop(zfs_cmd_t *zc) +{ + nvlist_t *nvl; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvl)) != 0) + return (error); + + error = zfs_set_prop_nvlist(zc->zc_name, nvl); + + nvlist_free(nvl); + return (error); +} + +/* + * inputs: + * zc_name name of 
filesystem + * zc_value name of property to inherit + * + * outputs: none + */ +static int +zfs_ioc_inherit_prop(zfs_cmd_t *zc) +{ + /* the property name has been validated by zfs_secpolicy_inherit() */ + return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); +} + +static int +zfs_ioc_pool_set_props(zfs_cmd_t *zc) +{ + nvlist_t *props; + spa_t *spa; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props))) + return (error); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + nvlist_free(props); + return (error); + } + + error = spa_prop_set(spa, props); + + nvlist_free(props); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_props(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *nvp = NULL; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_prop_get(spa, &nvp); + + if (error == 0 && zc->zc_nvlist_dst != NULL) + error = put_nvlist(zc, nvp); + else + error = EFAULT; + + spa_close(spa, FTAG); + + if (nvp) + nvlist_free(nvp); + return (error); +} + +static int +zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + uint32_t uid; + uint32_t gid; + uint32_t *groups; + uint_t group_cnt; + cred_t *usercred; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvp)) != 0) { + return (error); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_UID, &uid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32(nvp, + ZFS_DELEG_PERM_GID, &gid)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + + if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS, + &groups, &group_cnt)) != 0) { + nvlist_free(nvp); + return (EPERM); + } + usercred = cralloc(); + if ((crsetugid(usercred, uid, gid) != 0) || + (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) { + nvlist_free(nvp); + crfree(usercred); + return (EPERM); + } + nvlist_free(nvp); + error = dsl_deleg_access(zc->zc_name, + zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); + crfree(usercred); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_src{_size} nvlist of delegated permissions + * zc_perm_action allow/unallow flag + * + * outputs: none + */ +static int +zfs_ioc_set_fsacl(zfs_cmd_t *zc) +{ + int error; + nvlist_t *fsaclnv = NULL; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &fsaclnv)) != 0) + return (error); + + /* + * Verify nvlist is constructed correctly + */ + if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + nvlist_free(fsaclnv); + return (EINVAL); + } + + /* + * If we don't have PRIV_SYS_MOUNT, then validate + * that user is allowed to hand out each permission in + * the nvlist(s) + */ + + error = secpolicy_zfs(CRED()); + if (error) { + if (zc->zc_perm_action == B_FALSE) { + error = dsl_deleg_can_allow(zc->zc_name, + fsaclnv, CRED()); + } else { + error = dsl_deleg_can_unallow(zc->zc_name, + fsaclnv, CRED()); + } + } + + if (error == 0) + error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); + + nvlist_free(fsaclnv); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of delegated permissions + */ +static int +zfs_ioc_get_fsacl(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* + * inputs: + * zc_name name of volume + * + * 
outputs: none
+ */
+static int
+zfs_ioc_create_minor(zfs_cmd_t *zc)
+{
+ return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip)));
+}
+
+/*
+ * inputs:
+ * zc_name name of volume
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_remove_minor(zfs_cmd_t *zc)
+{
+ return (zvol_remove_minor(zc->zc_name));
+}
+
+/*
+ * Search the vfs list for a specified resource. Returns a pointer to it
+ * or NULL if no suitable entry is found. The caller of this routine
+ * is responsible for releasing the returned vfs pointer.
+ */
+static vfs_t *
+zfs_get_vfs(const char *resource)
+{
+ struct vfs *vfsp;
+ struct vfs *vfs_found = NULL;
+
+ vfs_list_read_lock();
+ vfsp = rootvfs;
+ do {
+ if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
+ VFS_HOLD(vfsp);
+ vfs_found = vfsp;
+ break;
+ }
+ vfsp = vfsp->vfs_next;
+ } while (vfsp != rootvfs);
+ vfs_list_unlock();
+ return (vfs_found);
+}
+
+/* ARGSUSED */
+static void
+zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ zfs_creat_t *zct = arg;
+
+ zfs_create_fs(os, cr, zct->zct_zplprops, tx);
+}
+
+#define ZFS_PROP_UNDEFINED ((uint64_t)-1)
+
+/*
+ * inputs:
+ * createprops list of properties requested by creator
+ * dataset name of dataset we are creating
+ *
+ * outputs:
+ * zplprops values for the zplprops we attach to the master node object
+ *
+ * Determine the settings for utf8only, normalization and
+ * casesensitivity. Specific values may have been requested by the
+ * creator and/or we can inherit values from the parent dataset. If
+ * the file system is of too early a vintage, a creator cannot
+ * request settings for these properties, even if the requested
+ * setting is the default value. We don't actually want to create dsl
+ * properties for these, so remove them from the source nvlist after
+ * processing.
+ */
+static int
+zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
+ nvlist_t *zplprops, uint64_t zplver, boolean_t *is_ci)
+{
+ objset_t *os;
+ char parentname[MAXNAMELEN];
+ char *cp;
+ uint64_t sense = ZFS_PROP_UNDEFINED;
+ uint64_t norm = ZFS_PROP_UNDEFINED;
+ uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error = 0;
+
+ ASSERT(zplprops != NULL);
+
+ (void) strlcpy(parentname, dataset, sizeof (parentname));
+ cp = strrchr(parentname, '/');
+ ASSERT(cp != NULL);
+ cp[0] = '\0';
+
+ /*
+ * Pull out creator prop choices, if any.
+ */
+ if (createprops) {
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), &sense);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE));
+ }
+
+ /*
+ * If the file system or pool version is too "young" to
+ * support normalization and the creator tried to set a value
+ * for one of the props, error out. We only need to check the
+ * ZPL version because we've already checked by now that the
+ * SPA version is compatible with the selected ZPL version. 
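+ *
+ * E.g. (a sketch) a create request carrying
+ *
+ *     nvlist_add_uint64(createprops,
+ *         zfs_prop_to_name(ZFS_PROP_CASE), ZFS_CASE_INSENSITIVE);
+ *
+ * is consumed (and stripped) above; if zplver predates
+ * ZPL_VERSION_NORMALIZATION the request is rejected with the
+ * ENOTSUP below.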
+ */ + if (zplver < ZPL_VERSION_NORMALIZATION && + (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || + sense != ZFS_PROP_UNDEFINED)) + return (ENOTSUP); + + /* + * Put the version in the zplprops + */ + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); + + /* + * Open parent object set so we can inherit zplprop values if + * necessary. + */ + if ((error = zfs_os_open_retry(parentname, &os)) != 0) + return (error); + + if (norm == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); + + /* + * If we're normalizing, names must always be valid UTF-8 strings. + */ + if (norm) + u8 = 1; + if (u8 == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); + + if (sense == ZFS_PROP_UNDEFINED) + VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); + + if (is_ci) + *is_ci = (sense == ZFS_CASE_INSENSITIVE); + + dmu_objset_close(os); + return (0); +} + +/* + * inputs: + * zc_objset_type type of objset to create (fs vs zvol) + * zc_name name of new objset + * zc_value name of snapshot to clone from (may be empty) + * zc_nvlist_src{_size} nvlist of properties to apply + * + * outputs: none + */ +static int +zfs_ioc_create(zfs_cmd_t *zc) +{ + objset_t *clone; + int error = 0; + zfs_creat_t zct; + nvlist_t *nvprops = NULL; + void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); + dmu_objset_type_t type = zc->zc_objset_type; + + switch (type) { + + case DMU_OST_ZFS: + cbfunc = zfs_create_cb; + break; + + case DMU_OST_ZVOL: + cbfunc = zvol_create_cb; + break; + + default: + cbfunc = NULL; + break; + } + if (strchr(zc->zc_name, '@') || + strchr(zc->zc_name, '%')) + return (EINVAL); + + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &nvprops)) != 0) + return (error); + + zct.zct_zplprops = NULL; + zct.zct_props = nvprops; + + if (zc->zc_value[0] != '\0') { + /* + * We're creating a clone of an existing snapshot. 
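+ *
+ * E.g. `zfs clone tank/fs@snap tank/clone' arrives here
+ * (a sketch of this ioctl's fields) as:
+ *
+ *     zc->zc_name  = "tank/clone"     the new dataset
+ *     zc->zc_value = "tank/fs@snap"   the origin snapshot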
+ */ + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { + nvlist_free(nvprops); + return (EINVAL); + } + + error = dmu_objset_open(zc->zc_value, type, + DS_MODE_STANDARD | DS_MODE_READONLY, &clone); + if (error) { + nvlist_free(nvprops); + return (error); + } + + error = dmu_objset_create(zc->zc_name, type, clone, 0, + NULL, NULL); + if (error) { + dmu_objset_close(clone); + nvlist_free(nvprops); + return (error); + } + dmu_objset_close(clone); + } else { + boolean_t is_insensitive = B_FALSE; + + if (cbfunc == NULL) { + nvlist_free(nvprops); + return (EINVAL); + } + + if (type == DMU_OST_ZVOL) { + uint64_t volsize, volblocksize; + + if (nvprops == NULL || + nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), + &volsize) != 0) { + nvlist_free(nvprops); + return (EINVAL); + } + + if ((error = nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize)) != 0 && error != ENOENT) { + nvlist_free(nvprops); + return (EINVAL); + } + + if (error != 0) + volblocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); + + if ((error = zvol_check_volblocksize( + volblocksize)) != 0 || + (error = zvol_check_volsize(volsize, + volblocksize)) != 0) { + nvlist_free(nvprops); + return (error); + } + } else if (type == DMU_OST_ZFS) { + uint64_t version; + int error; + + /* + * Default ZPL version to non-FUID capable if the + * pool is not upgraded to support FUIDs. + */ + if (zfs_check_version(zc->zc_name, SPA_VERSION_FUID)) + version = ZPL_VERSION_FUID - 1; + else + version = ZPL_VERSION; + + /* + * Potentially override default ZPL version based + * on creator's request. + */ + (void) nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VERSION), &version); + + /* + * Make sure version we ended up with is kosher + */ + if ((version < ZPL_VERSION_INITIAL || + version > ZPL_VERSION) || + (version >= ZPL_VERSION_FUID && + zfs_check_version(zc->zc_name, SPA_VERSION_FUID))) { + nvlist_free(nvprops); + return (ENOTSUP); + } + + /* + * We have to have normalization and + * case-folding flags correct when we do the + * file system creation, so go figure them out + * now. + */ + VERIFY(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops(zc->zc_name, nvprops, + zct.zct_zplprops, version, &is_insensitive); + if (error != 0) { + nvlist_free(nvprops); + nvlist_free(zct.zct_zplprops); + return (error); + } + } + error = dmu_objset_create(zc->zc_name, type, NULL, + is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); + nvlist_free(zct.zct_zplprops); + } + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) + (void) dmu_objset_destroy(zc->zc_name); + } + nvlist_free(nvprops); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_snapshot(zfs_cmd_t *zc) +{ + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + return (dmu_objset_snapshot(zc->zc_name, + zc->zc_value, zc->zc_cookie)); +} + +int +zfs_unmount_snap(char *name, void *arg) +{ + char *snapname = arg; + char *cp; + vfs_t *vfsp = NULL; + + /* + * Snapshots (which are under .zfs control) must be unmounted + * before they can be destroyed. 
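+ *
+ * For instance, zfs_unmount_snap("tank/fs", "snap1") briefly
+ * forms "tank/fs@snap1" in the caller's buffer, looks up its
+ * vfs_t, then writes '\0' back over the '@' (an illustration of
+ * the strcat/strchr dance below, not additional behavior).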
+ */ + + if (snapname) { + (void) strcat(name, "@"); + (void) strcat(name, snapname); + vfsp = zfs_get_vfs(name); + cp = strchr(name, '@'); + *cp = '\0'; + } else if (strchr(name, '@')) { + vfsp = zfs_get_vfs(name); + } + + if (vfsp) { + /* + * Always force the unmount for snapshots. + */ + int flag = MS_FORCE; + int err; + + if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { + VFS_RELE(vfsp); + return (err); + } + VFS_RELE(vfsp); + if ((err = dounmount(vfsp, flag, kcred)) != 0) + return (err); + } + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snapshot + * + * outputs: none + */ +static int +zfs_ioc_destroy_snaps(zfs_cmd_t *zc) +{ + int err; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + err = dmu_objset_find(zc->zc_name, + zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); + if (err) + return (err); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); +} + +/* + * inputs: + * zc_name name of dataset to destroy + * zc_objset_type type of objset + * + * outputs: none + */ +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { + int err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + + return (dmu_objset_destroy(zc->zc_name)); +} + +/* + * inputs: + * zc_name name of dataset to rollback (to most recent snapshot) + * + * outputs: none + */ +static int +zfs_ioc_rollback(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + zfsvfs_t *zfsvfs = NULL; + + /* + * Get the zfsvfs for the receiving objset. There + * won't be one if we're operating on a zvol, if the + * objset doesn't exist yet, or is not mounted. + */ + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD, &os); + if (error) + return (error); + + if (dmu_objset_type(os) == DMU_OST_ZFS) { + mutex_enter(&os->os->os_user_ptr_lock); + zfsvfs = dmu_objset_get_user(os); + if (zfsvfs != NULL) + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&os->os->os_user_ptr_lock); + } + + if (zfsvfs != NULL) { + char osname[MAXNAMELEN]; + int mode; + + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + ASSERT(strcmp(osname, zc->zc_name) == 0); + error = dmu_objset_rollback(os); + resume_err = zfs_resume_fs(zfsvfs, osname, mode); + error = error ? error : resume_err; + } else { + dmu_objset_close(os); + } + VFS_RELE(zfsvfs->z_vfs); + } else { + error = dmu_objset_rollback(os); + } + /* Note, the dmu_objset_rollback() closes the objset for us. */ + + return (error); +} + +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ +static int +zfs_ioc_rename(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie & 1; + + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '%')) + return (EINVAL); + + /* + * Unmount snapshot unless we're doing a recursive rename, + * in which case the dataset code figures out which snapshots + * to unmount. 
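+ *
+ * E.g. renaming a single snapshot (sketch of the ioctl fields):
+ *
+ *     zc->zc_name   = "tank/fs@old"   source
+ *     zc->zc_value  = "tank/fs@new"   target
+ *     zc->zc_cookie = 0               not recursive; unmount here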
+ */ + if (!recursive && strchr(zc->zc_name, '@') != NULL && + zc->zc_objset_type == DMU_OST_ZFS) { + int err = zfs_unmount_snap(zc->zc_name, NULL); + if (err) + return (err); + } + + return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); +} + +/* + * inputs: + * zc_name name of containing filesystem + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_value name of snapshot to create + * zc_string name of clone origin (if DRR_FLAG_CLONE) + * zc_cookie file descriptor to recv from + * zc_begin_record the BEGIN record of the stream (not byteswapped) + * zc_guid force flag + * + * outputs: + * zc_cookie number of bytes read + */ +static int +zfs_ioc_recv(zfs_cmd_t *zc) +{ + file_t *fp; + objset_t *os; + dmu_recv_cookie_t drc; + zfsvfs_t *zfsvfs = NULL; + boolean_t force = (boolean_t)zc->zc_guid; + int error, fd; + offset_t off; + nvlist_t *props = NULL; + objset_t *origin = NULL; + char *tosnap; + char tofs[ZFS_MAXNAMELEN]; + + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '@') == NULL || + strchr(zc->zc_value, '%')) + return (EINVAL); + + (void) strcpy(tofs, zc->zc_value); + tosnap = strchr(tofs, '@'); + *tosnap = '\0'; + tosnap++; + + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + &props)) != 0) + return (error); + + fd = zc->zc_cookie; + fp = getf(fd); + if (fp == NULL) { + nvlist_free(props); + return (EBADF); + } + + /* + * Get the zfsvfs for the receiving objset. There + * won't be one if we're operating on a zvol, if the + * objset doesn't exist yet, or is not mounted. + */ + + error = dmu_objset_open(tofs, DMU_OST_ZFS, + DS_MODE_STANDARD | DS_MODE_READONLY, &os); + if (!error) { + mutex_enter(&os->os->os_user_ptr_lock); + zfsvfs = dmu_objset_get_user(os); + if (zfsvfs != NULL) { + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&os->os->os_user_ptr_lock); + if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { + VFS_RELE(zfsvfs->z_vfs); + dmu_objset_close(os); + nvlist_free(props); + releasef(fd); + return (EBUSY); + } + } else { + mutex_exit(&os->os->os_user_ptr_lock); + } + dmu_objset_close(os); + } + + if (zc->zc_string[0]) { + error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &origin); + if (error) { + if (zfsvfs != NULL) { + mutex_exit(&zfsvfs->z_online_recv_lock); + VFS_RELE(zfsvfs->z_vfs); + } + nvlist_free(props); + releasef(fd); + return (error); + } + } + + error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, + force, origin, zfsvfs != NULL, &drc); + if (origin) + dmu_objset_close(origin); + if (error) { + if (zfsvfs != NULL) { + mutex_exit(&zfsvfs->z_online_recv_lock); + VFS_RELE(zfsvfs->z_vfs); + } + nvlist_free(props); + releasef(fd); + return (error); + } + + /* + * If properties are supplied, they are to completely replace + * the existing ones; "inherit" any existing properties. 
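+ *
+ * Schematically (a sketch of the steps below, nothing extra):
+ *
+ *     dsl_prop_get_all(os, &nv);           fetch current local props
+ *     for each pair in nv:
+ *         zfs_ioc_inherit_prop(zc2);       clear the local value
+ *     zfs_set_prop_nvlist(tofs, props);    then apply the new set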
+ */ + if (props) { + objset_t *os; + nvlist_t *nv = NULL; + + error = dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY | DS_MODE_INCONSISTENT, + &os); + if (error == 0) { + error = dsl_prop_get_all(os, &nv); + dmu_objset_close(os); + } + if (error == 0) { + nvpair_t *elem; + zfs_cmd_t *zc2; + zc2 = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); + + (void) strcpy(zc2->zc_name, tofs); + for (elem = nvlist_next_nvpair(nv, NULL); elem; + elem = nvlist_next_nvpair(nv, elem)) { + (void) strcpy(zc2->zc_value, nvpair_name(elem)); + if (zfs_secpolicy_inherit(zc2, CRED()) == 0) + (void) zfs_ioc_inherit_prop(zc2); + } + kmem_free(zc2, sizeof (zfs_cmd_t)); + } + if (nv) + nvlist_free(nv); + } + + /* + * Set properties. Note, we ignore errors. Would be better to + * do best-effort in zfs_set_prop_nvlist, too. + */ + (void) zfs_set_prop_nvlist(tofs, props); + nvlist_free(props); + + off = fp->f_offset; + error = dmu_recv_stream(&drc, fp->f_vnode, &off); + + if (error == 0) { + if (zfsvfs != NULL) { + char osname[MAXNAMELEN]; + int mode; + + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (error == 0) { + int resume_err; + + error = dmu_recv_end(&drc); + resume_err = zfs_resume_fs(zfsvfs, + osname, mode); + error = error ? error : resume_err; + } else { + dmu_recv_abort_cleanup(&drc); + } + } else { + error = dmu_recv_end(&drc); + } + } + if (zfsvfs != NULL) { + mutex_exit(&zfsvfs->z_online_recv_lock); + VFS_RELE(zfsvfs->z_vfs); + } + + zc->zc_cookie = off - fp->f_offset; + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; + + releasef(fd); + return (error); +} + +/* + * inputs: + * zc_name name of snapshot to send + * zc_value short name of incremental fromsnap (may be empty) + * zc_cookie file descriptor to send stream to + * zc_obj fromorigin flag (mutually exclusive with zc_value) + * + * outputs: none + */ +static int +zfs_ioc_send(zfs_cmd_t *zc) +{ + objset_t *fromsnap = NULL; + objset_t *tosnap; + file_t *fp; + int error; + offset_t off; + + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap); + if (error) + return (error); + + if (zc->zc_value[0] != '\0') { + char buf[MAXPATHLEN]; + char *cp; + + (void) strncpy(buf, zc->zc_name, sizeof (buf)); + cp = strchr(buf, '@'); + if (cp) + *(cp+1) = 0; + (void) strncat(buf, zc->zc_value, sizeof (buf)); + error = dmu_objset_open(buf, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap); + if (error) { + dmu_objset_close(tosnap); + return (error); + } + } + + fp = getf(zc->zc_cookie); + if (fp == NULL) { + dmu_objset_close(tosnap); + if (fromsnap) + dmu_objset_close(fromsnap); + return (EBADF); + } + + off = fp->f_offset; + error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_vnode, &off); + + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; + releasef(zc->zc_cookie); + if (fromsnap) + dmu_objset_close(fromsnap); + dmu_objset_close(tosnap); + return (error); +} + +static int +zfs_ioc_inject_fault(zfs_cmd_t *zc) +{ + int id, error; + + error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, + &zc->zc_inject_record); + + if (error == 0) + zc->zc_guid = (uint64_t)id; + + return (error); +} + +static int +zfs_ioc_clear_fault(zfs_cmd_t *zc) +{ + return (zio_clear_fault((int)zc->zc_guid)); +} + +static int +zfs_ioc_inject_list_next(zfs_cmd_t *zc) +{ + int id = (int)zc->zc_guid; + int error; + + error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), + &zc->zc_inject_record); + + zc->zc_guid = id; + + 
return (error); +} + +static int +zfs_ioc_error_log(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + size_t count = (size_t)zc->zc_nvlist_dst_size; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, + &count); + if (error == 0) + zc->zc_nvlist_dst_size = count; + else + zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_clear(zfs_cmd_t *zc) +{ + spa_t *spa; + vdev_t *vd; + uint64_t txg; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + /* + * Try to resume any I/Os which may have been suspended + * as a result of a complete pool failure. + */ + if (!list_is_empty(&spa->spa_zio_list)) { + if (zio_vdev_resume_io(spa) != 0) { + spa_close(spa, FTAG); + return (EIO); + } + } + + txg = spa_vdev_enter(spa); + + if (zc->zc_guid == 0) { + vd = NULL; + } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { + spa_aux_vdev_t *sav; + int i; + + /* + * Check if this is an l2cache device. + */ + ASSERT(spa != NULL); + sav = &spa->spa_l2cache; + for (i = 0; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_guid == zc->zc_guid) { + vd = sav->sav_vdevs[i]; + break; + } + } + + if (vd == NULL) { + (void) spa_vdev_exit(spa, NULL, txg, ENODEV); + spa_close(spa, FTAG); + return (ENODEV); + } + } + + vdev_clear(spa, vd, B_TRUE); + + (void) spa_vdev_exit(spa, NULL, txg, 0); + + spa_close(spa, FTAG); + + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of origin snapshot + * + * outputs: none + */ +static int +zfs_ioc_promote(zfs_cmd_t *zc) +{ + char *cp; + + /* + * We don't need to unmount *all* the origin fs's snapshots, but + * it's easier. + */ + cp = strchr(zc->zc_value, '@'); + if (cp) + *cp = '\0'; + (void) dmu_objset_find(zc->zc_value, + zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); + return (dsl_dataset_promote(zc->zc_name)); +} + +/* + * We don't want to have a hard dependency + * against some special symbols in sharefs + * nfs, and smbsrv. Determine them if needed when + * the first file system is shared. + * Neither sharefs, nfs or smbsrv are unloadable modules. + */ +int (*znfsexport_fs)(void *arg); +int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); +int (*zsmbexport_fs)(void *arg, boolean_t add_share); + +int zfs_nfsshare_inited; +int zfs_smbshare_inited; + +ddi_modhandle_t nfs_mod; +ddi_modhandle_t sharefs_mod; +ddi_modhandle_t smbsrv_mod; +kmutex_t zfs_share_lock; + +static int +zfs_init_sharefs() +{ + int error; + + ASSERT(MUTEX_HELD(&zfs_share_lock)); + /* Both NFS and SMB shares also require sharetab support. 
*/ + if (sharefs_mod == NULL && ((sharefs_mod = + ddi_modopen("fs/sharefs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + return (ENOSYS); + } + if (zshare_fs == NULL && ((zshare_fs = + (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) + ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { + return (ENOSYS); + } + return (0); +} + +static int +zfs_ioc_share(zfs_cmd_t *zc) +{ + int error; + int opcode; + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (zfs_nfsshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (znfsexport_fs == NULL && + ((znfsexport_fs = (int (*)(void *)) + ddi_modsym(nfs_mod, + "nfs_export", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_nfsshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (zfs_smbshare_inited == 0) { + mutex_enter(&zfs_share_lock); + if (smbsrv_mod == NULL && ((smbsrv_mod = + ddi_modopen("drv/smbsrv", + KRTLD_MODE_FIRST, &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + if (zsmbexport_fs == NULL && ((zsmbexport_fs = + (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, + "smb_server_share", &error)) == NULL)) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + error = zfs_init_sharefs(); + if (error) { + mutex_exit(&zfs_share_lock); + return (ENOSYS); + } + zfs_smbshare_inited = 1; + mutex_exit(&zfs_share_lock); + } + break; + default: + return (EINVAL); + } + + switch (zc->zc_share.z_sharetype) { + case ZFS_SHARE_NFS: + case ZFS_UNSHARE_NFS: + if (error = + znfsexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata)) + return (error); + break; + case ZFS_SHARE_SMB: + case ZFS_UNSHARE_SMB: + if (error = zsmbexport_fs((void *) + (uintptr_t)zc->zc_share.z_exportdata, + zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? + B_TRUE : B_FALSE)) { + return (error); + } + break; + } + + opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || + zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? + SHAREFS_ADD : SHAREFS_REMOVE; + + /* + * Add or remove share from sharetab + */ + error = zshare_fs(opcode, + (void *)(uintptr_t)zc->zc_share.z_sharedata, + zc->zc_share.z_sharemax); + + return (error); + +} + +/* + * pool create, destroy, and export don't log the history as part of + * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export + * do the logging of those commands. 
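+ *
+ * The vector below is position-sensitive: a command's index is
+ * (cmd - ZFS_IOC), so entries must stay in step with the zfs_ioc_t
+ * enumeration. Illustrative example:
+ *
+ *     vec = ZFS_IOC_POOL_CREATE - ZFS_IOC;    index 0, so
+ *     zfs_ioc_vec[vec].zvec_func == zfs_ioc_pool_create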
+ */ +static zfs_ioc_vec_t zfs_ioc_vec[] = { + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, + DATASET_NAME, B_FALSE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, + { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, + DATASET_NAME, B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, +}; + +static int +zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +{ + zfs_cmd_t *zc; + uint_t vec; + int error, rc; + + if (getminor(dev) != 0) + return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); + + vec = cmd - 
ZFS_IOC; + ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); + + if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) + return (EINVAL); + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + + error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t)); + + if (error == 0) + error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); + + /* + * Ensure that all pool/dataset names are valid before we pass down to + * the lower layers. + */ + if (error == 0) { + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + switch (zfs_ioc_vec[vec].zvec_namecheck) { + case POOL_NAME: + if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + + case DATASET_NAME: + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) + error = EINVAL; + break; + + case NO_NAME: + break; + } + } + + if (error == 0) + error = zfs_ioc_vec[vec].zvec_func(zc); + + rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t)); + if (error == 0) { + error = rc; + if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + zfs_log_history(zc); + } + + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); +} + +static int +zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + zfs_dip = dip; + + ddi_report_dev(dip); + + return (DDI_SUCCESS); +} + +static int +zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (spa_busy() || zfs_busy() || zvol_busy()) + return (DDI_FAILURE); + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + zfs_dip = NULL; + + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = zfs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + return (DDI_SUCCESS); + } + + return (DDI_FAILURE); +} + +/* + * OK, so this is a little weird. + * + * /dev/zfs is the control node, i.e. minor 0. + * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. + * + * /dev/zfs has basically nothing to do except serve up ioctls, + * so most of the standard driver entry points are in zvol.c. 
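+ *
+ * Dispatch therefore hinges on the minor number, roughly:
+ *
+ *     minor == 0  -> zfsdev_ioctl()           control ioctls (this file)
+ *     minor > 0   -> zvol_* entry points      block I/O, see zvol.c
+ *
+ * as implemented by the getminor(dev) test at the top of
+ * zfsdev_ioctl() above.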
+ */ +static struct cb_ops zfs_cb_ops = { + zvol_open, /* open */ + zvol_close, /* close */ + zvol_strategy, /* strategy */ + nodev, /* print */ + zvol_dump, /* dump */ + zvol_read, /* read */ + zvol_write, /* write */ + zfsdev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ + CB_REV, /* version */ + nodev, /* async read */ + nodev, /* async write */ +}; + +static struct dev_ops zfs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + zfs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + zfs_attach, /* attach */ + zfs_detach, /* detach */ + nodev, /* reset */ + &zfs_cb_ops, /* driver operations */ + NULL /* no bus operations */ +}; + +static struct modldrv zfs_modldrv = { + &mod_driverops, "ZFS storage pool version " SPA_VERSION_STRING, + &zfs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&zfs_modlfs, + (void *)&zfs_modldrv, + NULL +}; + + +uint_t zfs_fsyncer_key; +extern uint_t rrw_tsd_key; + +int +_init(void) +{ + int error; + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + + if ((error = mod_install(&modlinkage)) != 0) { + zvol_fini(); + zfs_fini(); + spa_fini(); + return (error); + } + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, NULL); + + error = ldi_ident_from_mod(&modlinkage, &zfs_li); + ASSERT(error == 0); + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +int +_fini(void) +{ + int error; + + if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + zvol_fini(); + zfs_fini(); + spa_fini(); + if (zfs_nfsshare_inited) + (void) ddi_modclose(nfs_mod); + if (zfs_smbshare_inited) + (void) ddi_modclose(smbsrv_mod); + if (zfs_nfsshare_inited || zfs_smbshare_inited) + (void) ddi_modclose(sharefs_mod); + + tsd_destroy(&zfs_fsyncer_key); + ldi_ident_release(zfs_li); + zfs_li = NULL; + mutex_destroy(&zfs_share_lock); + + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/zfs/lib/libdmu-ctl/zfs_log.c b/zfs/lib/libdmu-ctl/zfs_log.c new file mode 100644 index 0000000000..3643858084 --- /dev/null +++ b/zfs/lib/libdmu-ctl/zfs_log.c @@ -0,0 +1,693 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "@(#)zfs_log.c 1.13 08/04/09 SMI" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * All the functions in this file are used to construct the log entries + * to record transactions. They allocate * an intent log transaction + * structure (itx_t) and save within it all the information necessary to + * possibly replay the transaction. The itx is then assigned a sequence + * number and inserted in the in-memory list anchored in the zilog. + */ + +int +zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) +{ + int isxvattr = (vap->va_mask & AT_XVATTR); + switch (type) { + case Z_FILE: + if (vsecp == NULL && !isxvattr) + return (TX_CREATE); + if (vsecp && isxvattr) + return (TX_CREATE_ACL_ATTR); + if (vsecp) + return (TX_CREATE_ACL); + else + return (TX_CREATE_ATTR); + /*NOTREACHED*/ + case Z_DIR: + if (vsecp == NULL && !isxvattr) + return (TX_MKDIR); + if (vsecp && isxvattr) + return (TX_MKDIR_ACL_ATTR); + if (vsecp) + return (TX_MKDIR_ACL); + else + return (TX_MKDIR_ATTR); + case Z_XATTRDIR: + return (TX_MKXATTR); + } + ASSERT(0); + return (TX_MAX_TYPE); +} + +/* + * build up the log data necessary for logging xvattr_t + * First lr_attr_t is initialized. following the lr_attr_t + * is the mapsize and attribute bitmap copied from the xvattr_t. + * Following the bitmap and bitmapsize two 64 bit words are reserved + * for the create time which may be set. Following the create time + * records a single 64 bit integer which has the bits to set on + * replay for the xvattr. + */ +static void +zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + uint32_t *bitmap; + uint64_t *attrs; + uint64_t *crtime; + xoptattr_t *xoap; + void *scanstamp; + int i; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + lrattr->lr_attr_masksize = xvap->xva_mapsize; + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { + *bitmap = xvap->xva_reqattrmap[i]; + } + + /* Now pack the attributes up in a single uint64_t */ + attrs = (uint64_t *)bitmap; + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + *attrs = 0; + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + *attrs |= (xoap->xoa_readonly == 0) ? 0 : + XAT0_READONLY; + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + *attrs |= (xoap->xoa_hidden == 0) ? 0 : + XAT0_HIDDEN; + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + *attrs |= (xoap->xoa_system == 0) ? 0 : + XAT0_SYSTEM; + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + *attrs |= (xoap->xoa_archive == 0) ? 0 : + XAT0_ARCHIVE; + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + *attrs |= (xoap->xoa_immutable == 0) ? 0 : + XAT0_IMMUTABLE; + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + *attrs |= (xoap->xoa_nounlink == 0) ? 0 : + XAT0_NOUNLINK; + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + *attrs |= (xoap->xoa_appendonly == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + *attrs |= (xoap->xoa_opaque == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + *attrs |= (xoap->xoa_nodump == 0) ? 0 : + XAT0_NODUMP; + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : + XAT0_AV_QUARANTINED; + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + *attrs |= (xoap->xoa_av_modified == 0) ? 
0 : + XAT0_AV_MODIFIED; + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); +} + +static void * +zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_t *zfuid; + uint64_t *fuidloc = start; + + /* First copy in the ACE FUIDs */ + for (zfuid = list_head(&fuidp->z_fuids); zfuid; + zfuid = list_next(&fuidp->z_fuids, zfuid)) { + *fuidloc++ = zfuid->z_logfuid; + } + return (fuidloc); +} + + +static void * +zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_domain_t *zdomain; + + /* now copy in the domain info, if any */ + if (fuidp->z_domain_str_sz != 0) { + for (zdomain = list_head(&fuidp->z_domains); zdomain; + zdomain = list_next(&fuidp->z_domains, zdomain)) { + bcopy((void *)zdomain->z_domain, start, + strlen(zdomain->z_domain) + 1); + start = (caddr_t)start + + strlen(zdomain->z_domain) + 1; + } + } + return (start); +} + +/* + * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, + * TX_MKDIR_ATTR and TX_MKXATTR + * transactions. + * + * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID + * domain information appended prior to the name. In this case the + * uid/gid in the log record will be a log centric FUID. + * + * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that + * may contain attributes, ACL and optional fuid information. + * + * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify + * and ACL and normal users/groups in the ACEs. + * + * There may be an optional xvattr attribute information similar + * to zfs_log_setattr. + * + * Also, after the file name "domain" strings may be appended. + */ +void +zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, + zfs_fuid_info_t *fuidp, vattr_t *vap) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + lr_acl_create_t *lracl; + size_t aclsize; + size_t xvatsize = 0; + size_t txsize; + xvattr_t *xvap = (xvattr_t *)vap; + void *end; + size_t lrsize; + + size_t namesize = strlen(name) + 1; + size_t fuidsz = 0; + + if (zilog == NULL) + return; + + /* + * If we have FUIDs present then add in space for + * domains and ACE fuid's if any. + */ + if (fuidp) { + fuidsz += fuidp->z_domain_str_sz; + fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); + } + + if (vap->va_mask & AT_XVATTR) + xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || + (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || + (int)txtype == TX_MKXATTR) { + txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; + lrsize = sizeof (*lr); + } else { + aclsize = (vsecp) ? 
vsecp->vsa_aclentsz : 0; + txsize = + sizeof (lr_acl_create_t) + namesize + fuidsz + + ZIL_ACE_LENGTH(aclsize) + xvatsize; + lrsize = sizeof (lr_acl_create_t); + } + + itx = zil_itx_create(txtype, txsize); + + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { + lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; + } else { + lr->lr_uid = fuidp->z_fuid_owner; + } + if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { + lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; + } else { + lr->lr_gid = fuidp->z_fuid_group; + } + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + lr->lr_rdev = zp->z_phys->zp_rdev; + + /* + * Fill in xvattr info if any + */ + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); + end = (caddr_t)lr + lrsize + xvatsize; + } else { + end = (caddr_t)lr + lrsize; + } + + /* Now fill in any ACL info */ + + if (vsecp) { + lracl = (lr_acl_create_t *)&itx->itx_lr; + lracl->lr_aclcnt = vsecp->vsa_aclcnt; + lracl->lr_acl_bytes = aclsize; + lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) + lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lracl->lr_acl_flags = 0; + + bcopy(vsecp->vsa_aclentp, end, aclsize); + end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); + } + + /* drop in FUID info */ + if (fuidp) { + end = zfs_log_fuid_ids(fuidp, end); + end = zfs_log_fuid_domains(fuidp, end); + } + /* + * Now place file name in log record + */ + bcopy(name, end, namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. + */ +void +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_remove_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_remove_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; +} + +/* + * zfs_log_link() handles TX_LINK transactions. + */ +void +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + uint64_t seq; + lr_link_t *lr; + size_t namesize = strlen(name) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_link_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_link_obj = zp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_symlink() handles TX_SYMLINK transactions. 
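+ *
+ * The record is laid out as below (a sketch; sizes are implicit):
+ *
+ *     [ lr_create_t ][ name\0 ][ link\0 ]
+ *
+ * so replaying, say, "ln -s target mylink" recovers both strings
+ * from the bytes following the fixed-size portion.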
+ */ +void +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) +{ + itx_t *itx; + uint64_t seq; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + size_t linksize = strlen(link) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_mode = zp->z_phys->zp_mode; + lr->lr_uid = zp->z_phys->zp_uid; + lr->lr_gid = zp->z_phys->zp_gid; + lr->lr_gen = zp->z_phys->zp_gen; + lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; + lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + bcopy(name, (char *)(lr + 1), namesize); + bcopy(link, (char *)(lr + 1) + namesize, linksize); + + seq = zil_itx_assign(zilog, itx, tx); + dzp->z_last_itx = seq; + zp->z_last_itx = seq; +} + +/* + * zfs_log_rename() handles TX_RENAME transactions. + */ +void +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + itx_t *itx; + uint64_t seq; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zilog == NULL) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + bcopy(sname, (char *)(lr + 1), snamesize); + bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + + seq = zil_itx_assign(zilog, itx, tx); + sdzp->z_last_itx = seq; + tdzp->z_last_itx = seq; + szp->z_last_itx = seq; +} + +/* + * zfs_log_write() handles TX_WRITE transactions. + */ +ssize_t zfs_immediate_write_sz = 32768; + +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ + sizeof (lr_write_t)) + +void +zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t resid, int ioflag) +{ + itx_wr_state_t write_state; + boolean_t slogging; + uintptr_t fsync_cnt; + + if (zilog == NULL || zp->z_unlinked) + return; + + /* + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * If the write is greater than zfs_immediate_write_sz and there are + * no separate logs in this pool then later *if* we need to log the + * write then dmu_sync() is used to immediately write the block and + * its block pointer is put in the log record. + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (FSYNC or FDSYNC), the we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. + */ + slogging = spa_has_slogs(zilog->zl_spa); + if (resid > zfs_immediate_write_sz && !slogging) + write_state = WR_INDIRECT; + else if (ioflag & (FSYNC | FDSYNC)) + write_state = WR_COPIED; + else + write_state = WR_NEED_COPY; + + if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { + (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); + } + + while (resid) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + + /* + * If there are slogs and the write would overflow the largest + * block, then because we don't want to use the main pool + * to dmu_sync, we have to split the write. 
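+ *
+ * Worked example with this release's 128K SPA_MAXBLOCKSIZE
+ * (illustrative numbers only): a 300K fsync'ed write with slogs
+ * present is logged as chunks of SPA_MAXBLOCKSIZE >> 1 = 64K
+ * until the remainder fits: 64K + 64K + 64K + 108K.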
+ */ + if (slogging && resid > ZIL_MAX_LOG_DATA) + len = SPA_MAXBLOCKSIZE >> 1; + else + len = resid; + + itx = zil_itx_create(txtype, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, + zp->z_id, off, len, lr + 1) != 0) { + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } + + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zp->z_zfsvfs; + + if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || + (ioflag & (FSYNC | FDSYNC))) + itx->itx_sync = B_TRUE; + else + itx->itx_sync = B_FALSE; + + zp->z_last_itx = zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; + } +} + +/* + * zfs_log_truncate() handles TX_TRUNCATE transactions. + */ +void +zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len) +{ + itx_t *itx; + uint64_t seq; + lr_truncate_t *lr; + + if (zilog == NULL || zp->z_unlinked) + return; + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} + +/* + * zfs_log_setattr() handles TX_SETATTR transactions. + */ +void +zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + uint64_t seq; + lr_setattr_t *lr; + xvattr_t *xvap = (xvattr_t *)vap; + size_t recsize = sizeof (lr_setattr_t); + void *start; + + + if (zilog == NULL || zp->z_unlinked) + return; + + /* + * If XVATTR set, then log record size needs to allow + * for lr_attr_t + xvattr mask, mapsize and create time + * plus actual attribute values + */ + if (vap->va_mask & AT_XVATTR) + recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if (fuidp) + recsize += fuidp->z_domain_str_sz; + + itx = zil_itx_create(txtype, recsize); + lr = (lr_setattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_mask = (uint64_t)mask_applied; + lr->lr_mode = (uint64_t)vap->va_mode; + if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) + lr->lr_uid = fuidp->z_fuid_owner; + else + lr->lr_uid = (uint64_t)vap->va_uid; + + if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) + lr->lr_gid = fuidp->z_fuid_group; + else + lr->lr_gid = (uint64_t)vap->va_gid; + + lr->lr_size = (uint64_t)vap->va_size; + ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & AT_XVATTR) { + zfs_log_xvattr((lr_attr_t *)start, xvap); + start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); + } + + /* + * Now stick on domain information if any on end + */ + + if (fuidp) + (void) zfs_log_fuid_domains(fuidp, start); + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} + +/* + * zfs_log_acl() handles TX_ACL transactions. 
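+ *
+ * A TX_ACL record is assembled as (sketch):
+ *
+ *     [ lr_acl_t ][ ACEs, ZIL_ACE_LENGTH aligned ][ FUIDs ][ domains ]
+ *
+ * while the pre-FUID TX_ACL_V0 form is simply [ lr_acl_v0_t ][ ACEs ].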
+ */ +void +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + uint64_t seq; + lr_acl_v0_t *lrv0; + lr_acl_t *lr; + int txtype; + int lrsize; + size_t txsize; + size_t aclbytes = vsecp->vsa_aclentsz; + + txtype = (zp->z_zfsvfs->z_version == ZPL_VERSION_INITIAL) ? + TX_ACL_V0 : TX_ACL; + + if (txtype == TX_ACL) + lrsize = sizeof (*lr); + else + lrsize = sizeof (*lrv0); + + if (zilog == NULL || zp->z_unlinked) + return; + + txsize = lrsize + + ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + + (fuidp ? fuidp->z_domain_str_sz : 0) + + sizeof (uint64) * (fuidp ? fuidp->z_fuid_cnt : 0); + + itx = zil_itx_create(txtype, txsize); + + lr = (lr_acl_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + if (txtype == TX_ACL) { + lr->lr_acl_bytes = aclbytes; + lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) + lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lr->lr_acl_flags = 0; + } + lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; + + if (txtype == TX_ACL_V0) { + lrv0 = (lr_acl_v0_t *)lr; + bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + } else { + void *start = (ace_t *)(lr + 1); + + bcopy(vsecp->vsa_aclentp, start, aclbytes); + + start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); + + if (fuidp) { + start = zfs_log_fuid_ids(fuidp, start); + (void) zfs_log_fuid_domains(fuidp, start); + } + } + + itx->itx_sync = (zp->z_sync_cnt != 0); + seq = zil_itx_assign(zilog, itx, tx); + zp->z_last_itx = seq; +} diff --git a/zfs/lib/libdmu-ctl/zfs_replay.c b/zfs/lib/libdmu-ctl/zfs_replay.c new file mode 100644 index 0000000000..ca9990d7c2 --- /dev/null +++ b/zfs/lib/libdmu-ctl/zfs_replay.c @@ -0,0 +1,876 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "@(#)zfs_replay.c 1.7 08/01/14 SMI" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Functions to replay ZFS intent log (ZIL) records + * The functions are called through a function vector (zfs_replay_vector) + * which is indexed by the transaction type. + */ + +static void +zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) +{ + bzero(vap, sizeof (*vap)); + vap->va_mask = (uint_t)mask; + vap->va_type = IFTOVT(mode); + vap->va_mode = mode & MODEMASK; + vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? 
-1 : uid; + vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid; + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_nodeid = nodeid; +} + +/* ARGSUSED */ +static int +zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) +{ + return (ENOTSUP); +} + +static void +zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + uint64_t *attrs; + uint64_t *crtime; + uint32_t *bitmap; + void *scanstamp; + int i; + + xvap->xva_vattr.va_mask |= AT_XVATTR; + if ((xoap = xva_getxoptattr(xvap)) == NULL) { + xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ + return; + } + + ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); + + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) + xvap->xva_reqattrmap[i] = *bitmap; + + attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + xoap->xoa_av_quarantined = + ((*attrs & XAT0_AV_QUARANTINED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); +} + +static int +zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) +{ + uint64_t uid_idx; + uint64_t gid_idx; + int domcnt = 0; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + if (uid_idx) + domcnt++; + if (gid_idx > 0 && gid_idx != uid_idx) + domcnt++; + + return (domcnt); +} + +static void * +zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, + int domcnt) +{ + int i; + + for (i = 0; i != domcnt; i++) { + fuid_infop->z_domain_table[i] = start; + start = (caddr_t)start + strlen(start) + 1; + } + + return (start); +} + +/* + * Set the uid/gid in the fuid_info structure. 
+ */
+static void
+zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
+{
+	/*
+	 * If owner or group are log-specific FUIDs then slurp up
+	 * domain information and build zfs_fuid_info_t
+	 */
+	if (IS_EPHEMERAL(uid))
+		fuid_infop->z_fuid_owner = uid;
+
+	if (IS_EPHEMERAL(gid))
+		fuid_infop->z_fuid_group = gid;
+}
+
+/*
+ * Load fuid domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
+{
+	int domcnt;
+
+	zfs_fuid_info_t *fuid_infop;
+
+	fuid_infop = zfs_fuid_info_alloc();
+
+	domcnt = zfs_replay_domain_cnt(uid, gid);
+
+	if (domcnt == 0)
+		return (fuid_infop);
+
+	fuid_infop->z_domain_table =
+	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
+
+	zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+	fuid_infop->z_domain_cnt = domcnt;
+	*end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
+	return (fuid_infop);
+}
+
+/*
+ * Load zfs_fuid_t's and fuid_domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
+    uint64_t gid)
+{
+	uint64_t *log_fuid = (uint64_t *)start;
+	zfs_fuid_info_t *fuid_infop;
+	int i;
+
+	fuid_infop = zfs_fuid_info_alloc();
+	fuid_infop->z_domain_cnt = domcnt;
+
+	fuid_infop->z_domain_table =
+	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
+
+	for (i = 0; i != idcnt; i++) {
+		zfs_fuid_t *zfuid;
+
+		zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+		zfuid->z_logfuid = *log_fuid;
+		zfuid->z_id = -1;
+		zfuid->z_domidx = 0;
+		list_insert_tail(&fuid_infop->z_fuids, zfuid);
+		log_fuid++;
+	}
+
+	zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+	*end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
+	return (fuid_infop);
+}
+
+static void
+zfs_replay_swap_attrs(lr_attr_t *lrattr)
+{
+	/* swap the lr_attr structure */
+	byteswap_uint32_array(lrattr, sizeof (*lrattr));
+	/* swap the bitmap */
+	byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
+	    sizeof (uint32_t));
+	/* swap the attributes, create time + 64 bit word for attributes */
+	byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
+	    (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
+}
+
+/*
+ * Replay file create with optional ACL, xvattr information as well
+ * as optional FUID information.
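+ * Reading the byteswap and pointer arithmetic below, the
+ * variable-length tail of the record is laid out as:
+ * [lr_attr_t + xvattr data (the *_ACL_ATTR types only)], then the
+ * ACEs, then any FUIDs and their domain strings, and finally the
+ * file name.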
+ */ +static int +zfs_replay_create_acl(zfsvfs_t *zfsvfs, + lr_acl_create_t *lracl, boolean_t byteswap) +{ + char *name = NULL; /* location determined later */ + lr_create_t *lr = (lr_create_t *)lracl; + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + vsecattr_t vsec = { 0 }; + lr_attr_t *lrattr; + void *aclstart; + void *fuidstart; + size_t xvatlen = 0; + uint64_t txtype; + int error; + + if (byteswap) { + byteswap_uint64_array(lracl, sizeof (*lracl)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ACL_ATTR || + txtype == TX_MKDIR_ACL_ATTR) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + zfs_replay_swap_attrs(lrattr); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + aclstart = (caddr_t)(lracl + 1) + xvatlen; + zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); + /* swap fuids */ + if (lracl->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), + lracl->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto bail; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_CREATE_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + xva.xva_vattr.va_mask |= AT_XVATTR; + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + + error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, + 0, 0, &vp, kcred, vflg, NULL, &vsec); + break; + case TX_MKDIR_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_MKDIR_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + 
xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, + &vp, kcred, NULL, vflg, &vsec); + break; + default: + error = ENOTSUP; + } + +bail: + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + + return (error); +} + +static int +zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) +{ + char *name = NULL; /* location determined later */ + char *link; /* symlink content follows name */ + znode_t *dzp; + vnode_t *vp = NULL; + xvattr_t xva; + int vflg = 0; + size_t lrsize = sizeof (lr_create_t); + lr_attr_t *lrattr; + void *start; + size_t xvatlen; + uint64_t txtype; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + txtype = (int)lr->lr_common.lrc_txtype; + if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time and generation number. The generic VOP_CREATE() + * doesn't have either concept, so we smuggle the values inside + * the vattr's otherwise unused va_ctime and va_nblocks fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + + error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); + if (error != ENOENT) + goto out; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + /* + * Symlinks don't have fuid info, and CIFS never creates + * symlinks. + * + * The _ATTR versions will grab the fuid info in their subcases. 
+ */ + if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && + (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && + (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + start = (lr + 1); + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + } + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_CREATE_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ + case TX_CREATE: + if (name == NULL) + name = (char *)start; + + error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, + 0, 0, &vp, kcred, vflg, NULL, NULL); + break; + case TX_MKDIR_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ + case TX_MKDIR: + if (name == NULL) + name = (char *)(lr + 1); + + error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, + &vp, kcred, NULL, vflg, NULL); + break; + case TX_MKXATTR: + name = (char *)(lr + 1); + error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); + break; + case TX_SYMLINK: + name = (char *)(lr + 1); + link = name + strlen(name) + 1; + error = VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr, + link, kcred, NULL, vflg); + break; + default: + error = ENOTSUP; + } + +out: + if (error == 0 && vp != NULL) + VN_RELE(vp); + + VN_RELE(ZTOV(dzp)); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + return (error); +} + +static int +zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_remove_t */ + znode_t *dzp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_REMOVE: + error = VOP_REMOVE(ZTOV(dzp), name, kcred, NULL, vflg); + break; + case TX_RMDIR: + error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred, NULL, vflg); + break; + default: + error = ENOTSUP; + } + + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap) +{ + char *name = (char *)(lr + 1); /* name follows lr_link_t */ + znode_t *dzp, *zp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { + VN_RELE(ZTOV(dzp)); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred, NULL, vflg); + + VN_RELE(ZTOV(zp)); + VN_RELE(ZTOV(dzp)); + + return (error); +} + +static int +zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap) +{ + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + znode_t *sdzp, *tdzp; + int error; + int vflg = 0; + + if (byteswap) + 
byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { + VN_RELE(ZTOV(sdzp)); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred, + NULL, vflg); + + VN_RELE(ZTOV(tdzp)); + VN_RELE(ZTOV(sdzp)); + + return (error); +} + +static int +zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + znode_t *zp; + int error; + ssize_t resid; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, + lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) +{ + znode_t *zp; + flock64_t fl; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log truncates out of order, it's possible the + * file has been removed. In this case just drop the truncate + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + bzero(&fl, sizeof (fl)); + fl.l_type = F_WRLCK; + fl.l_whence = 0; + fl.l_start = lr->lr_offset; + fl.l_len = lr->lr_length; + + error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, + lr->lr_offset, kcred, NULL); + + VN_RELE(ZTOV(zp)); + + return (error); +} + +static int +zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) +{ + znode_t *zp; + xvattr_t xva; + vattr_t *vap = &xva.xva_vattr; + int error; + void *start; + + xva_init(&xva); + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((lr->lr_mask & AT_XVATTR) && + zfsvfs->z_version >= ZPL_VERSION_INITIAL) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log setattrs out of order, it's possible the + * file has been removed. In this case just drop the setattr + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, + lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); + + vap->va_size = lr->lr_size; + ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); + + /* + * Fill in xvattr_t portions if necessary. 
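+	 * The record tail mirrors what zfs_log_setattr() wrote: the
+	 * optional xvattr data first, followed by any FUID domain
+	 * strings.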
+	 */
+
+	start = (lr_setattr_t *)(lr + 1);
+	if (vap->va_mask & AT_XVATTR) {
+		zfs_replay_xvattr((lr_attr_t *)start, &xva);
+		start = (caddr_t)start +
+		    ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
+	} else
+		xva.xva_vattr.va_mask &= ~AT_XVATTR;
+
+	zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
+	    lr->lr_uid, lr->lr_gid);
+
+	error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL);
+
+	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+	zfsvfs->z_fuid_replay = NULL;
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+static int
+zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
+{
+	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_v0_t */
+	vsecattr_t vsa;
+	znode_t *zp;
+	int error;
+
+	if (byteswap) {
+		byteswap_uint64_array(lr, sizeof (*lr));
+		zfs_oldace_byteswap(ace, lr->lr_aclcnt);
+	}
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+		/*
+		 * As we can log acls out of order, it's possible the
+		 * file has been removed. In this case just drop the acl
+		 * and return success.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		return (error);
+	}
+
+	bzero(&vsa, sizeof (vsa));
+	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+	vsa.vsa_aclcnt = lr->lr_aclcnt;
+	vsa.vsa_aclentp = ace;
+
+	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replaying ACLs is complicated by FUID support.
+ * The log record may contain some optional data
+ * to be used for replaying FUIDs. These pieces
+ * are the actual FUIDs that were created initially.
+ * The FUID table index may no longer be valid and
+ * during zfs_create() a new index may be assigned.
+ * Because of this the log will contain the original
+ * domain+rid in order to create a new FUID.
+ *
+ * The individual ACEs may contain an ephemeral uid/gid which is no
+ * longer valid and will need to be replaced with an actual FUID.
+ */
+static int
+zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
+{
+	ace_t *ace = (ace_t *)(lr + 1);
+	vsecattr_t vsa;
+	znode_t *zp;
+	int error;
+
+	if (byteswap) {
+		byteswap_uint64_array(lr, sizeof (*lr));
+		zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
+		if (lr->lr_fuidcnt) {
+			byteswap_uint64_array((caddr_t)ace +
+			    ZIL_ACE_LENGTH(lr->lr_acl_bytes),
+			    lr->lr_fuidcnt * sizeof (uint64_t));
+		}
+	}
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+		/*
+		 * As we can log acls out of order, it's possible the
+		 * file has been removed. In this case just drop the acl
+		 * and return success.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		return (error);
+	}
+
+	bzero(&vsa, sizeof (vsa));
+	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
+	vsa.vsa_aclcnt = lr->lr_aclcnt;
+	vsa.vsa_aclentp = ace;
+	vsa.vsa_aclentsz = lr->lr_acl_bytes;
+	vsa.vsa_aclflags = lr->lr_acl_flags;
+
+	if (lr->lr_fuidcnt) {
+		void *fuidstart = (caddr_t)ace +
+		    ZIL_ACE_LENGTH(lr->lr_acl_bytes);
+
+		zfsvfs->z_fuid_replay =
+		    zfs_replay_fuids(fuidstart, &fuidstart,
+		    lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
+	}
+
+	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
+
+	if (zfsvfs->z_fuid_replay)
+		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+
+	zfsvfs->z_fuid_replay = NULL;
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Callback vectors for replaying records
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+	zfs_replay_error,	/* 0 no such transaction type */
+	zfs_replay_create,	/* TX_CREATE */
+	zfs_replay_create,	/* TX_MKDIR */
+	zfs_replay_create,	/* TX_MKXATTR */
+	zfs_replay_create,	/* TX_SYMLINK */
+	zfs_replay_remove,	/* TX_REMOVE */
+	zfs_replay_remove,	/* TX_RMDIR */
+	zfs_replay_link,	/* TX_LINK */
+	zfs_replay_rename,	/* TX_RENAME */
+	zfs_replay_write,	/* TX_WRITE */
+	zfs_replay_truncate,	/* TX_TRUNCATE */
+	zfs_replay_setattr,	/* TX_SETATTR */
+	zfs_replay_acl_v0,	/* TX_ACL_V0 */
+	zfs_replay_acl,		/* TX_ACL */
+	zfs_replay_create_acl,	/* TX_CREATE_ACL */
+	zfs_replay_create,	/* TX_CREATE_ATTR */
+	zfs_replay_create_acl,	/* TX_CREATE_ACL_ATTR */
+	zfs_replay_create_acl,	/* TX_MKDIR_ACL */
+	zfs_replay_create,	/* TX_MKDIR_ATTR */
+	zfs_replay_create_acl,	/* TX_MKDIR_ACL_ATTR */
+};
diff --git a/zfs/lib/libdmu-ctl/zfs_rlock.c b/zfs/lib/libdmu-ctl/zfs_rlock.c
new file mode 100644
index 0000000000..44ec73b5df
--- /dev/null
+++ b/zfs/lib/libdmu-ctl/zfs_rlock.c
@@ -0,0 +1,602 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"@(#)zfs_rlock.c	1.4	07/08/08 SMI"
+
+/*
+ * This file contains the code to implement file range locking in
+ * ZFS, although there isn't much specific to ZFS (all that comes to
+ * mind is support for growing the blocksize).
+ *
+ * Interface
+ * ---------
+ * Defined in zfs_rlock.h but essentially:
+ *	rl = zfs_range_lock(zp, off, len, lock_type);
+ *	zfs_range_unlock(rl);
+ *	zfs_range_reduce(rl, off, len);
+ *
+ * AVL tree
+ * --------
+ * An AVL tree is used to maintain the state of the existing ranges
+ * that are locked for exclusive (writer) or shared (reader) use.
+ * The starting range offset is used for searching and sorting the tree.
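+ *
+ * Example
+ * -------
+ * A typical write-path caller (an illustrative sketch, using the
+ * interface above):
+ *
+ *	rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ *	... modify the file data for [off, off + len) ...
+ *	zfs_range_unlock(rl);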
+ *
+ * Common case
+ * -----------
+ * The (hopefully) usual case is of no overlaps or contention for
+ * locks. On entry to zfs_range_lock() a rl_t is allocated; the tree
+ * is searched and, finding no overlap, *this* rl_t is placed in the tree.
+ *
+ * Overlaps/Reference counting/Proxy locks
+ * ---------------------------------------
+ * The avl code only allows one node at a particular offset. Also it's very
+ * inefficient to search through all previous entries looking for overlaps
+ * (because the very 1st in the ordered list might be at offset 0 but
+ * cover the whole file).
+ * So this implementation uses reference counts and proxy range locks.
+ * Firstly, only reader locks use reference counts and proxy locks,
+ * because writer locks are exclusive.
+ * When a reader lock overlaps with another then a proxy lock is created
+ * for that range and replaces the original lock. If the overlap
+ * is exact then the reference count of the proxy is simply incremented.
+ * Otherwise, the proxy lock is split into smaller lock ranges and
+ * new proxy locks created for the non-overlapping ranges.
+ * The reference counts are adjusted accordingly.
+ * Meanwhile, the original lock is kept around (this is the caller's
+ * handle) and its offset and length are used when releasing the lock.
+ *
+ * Thread coordination
+ * -------------------
+ * In order to make wakeups efficient and to ensure multiple continuous
+ * readers on a range don't starve a writer for the same range lock,
+ * two condition variables are allocated in each rl_t.
+ * If a writer (or reader) can't get a range it initialises the writer
+ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
+ * and waits on that cv. When a thread unlocks that range it wakes up all
+ * writers then all readers before destroying the lock.
+ *
+ * Append mode writes
+ * ------------------
+ * Append mode writes need to lock a range at the end of a file.
+ * The offset of the end of the file is determined under the
+ * range locking mutex, the lock type is converted from RL_APPEND to
+ * RL_WRITER, and the range locked.
+ *
+ * Grow block handling
+ * -------------------
+ * ZFS supports multiple block sizes, currently up to 128K. The smallest
+ * block size is used for the file which is grown as needed. During this
+ * growth all other writers and readers must be excluded.
+ * So if the block size needs to be grown then the whole file is
+ * exclusively locked, then later the caller will reduce the lock
+ * range to just the range to be written using zfs_range_reduce().
+ */
+
+#include 
+
+/*
+ * Check if a write lock can be grabbed, or wait and recheck until available.
+ */
+static void
+zfs_range_lock_writer(znode_t *zp, rl_t *new)
+{
+	avl_tree_t *tree = &zp->z_range_avl;
+	rl_t *rl;
+	avl_index_t where;
+	uint64_t end_size;
+	uint64_t off = new->r_off;
+	uint64_t len = new->r_len;
+
+	for (;;) {
+		/*
+		 * Range locking is also used by zvol and uses a
+		 * dummied up znode. However, for zvol, we don't need to
+		 * append or grow blocksize, and besides we don't have
+		 * a z_phys or z_zfsvfs - so skip that processing.
+		 *
+		 * Yes, this is ugly, and would be solved by not handling
+		 * grow or append in range lock code. If that was done then
+		 * we could make the range locking code generically available
+		 * to other non-zfs consumers.
+		 */
+		if (zp->z_vnode) { /* caller is ZPL */
+			/*
+			 * If in append mode pick up the current end of file.
+			 * This is done under z_range_lock to avoid races.
+			 */
+			if (new->r_type == RL_APPEND)
+				new->r_off = zp->z_phys->zp_size;
+
+			/*
+			 * If we need to grow the block size then grab the whole
+			 * file range.
This is also done under z_range_lock to + * avoid races. + */ + end_size = MAX(zp->z_phys->zp_size, new->r_off + len); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->r_off = 0; + new->r_len = UINT64_MAX; + } + } + + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(tree) == 0) { + new->r_type = RL_WRITER; /* convert to writer */ + avl_add(tree, new); + return; + } + + /* + * Look for any locks in the range. + */ + rl = avl_find(tree, new, &where); + if (rl) + goto wait; /* already locked at same offset */ + + rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + if (rl && (rl->r_off < new->r_off + new->r_len)) + goto wait; + + rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + if (rl && rl->r_off + rl->r_len > new->r_off) + goto wait; + + new->r_type = RL_WRITER; /* convert possible RL_APPEND */ + avl_insert(tree, new, where); + return; +wait: + if (!rl->r_write_wanted) { + cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); + rl->r_write_wanted = B_TRUE; + } + cv_wait(&rl->r_wr_cv, &zp->z_range_lock); + + /* reset to original */ + new->r_off = off; + new->r_len = len; + } +} + +/* + * If this is an original (non-proxy) lock then replace it by + * a proxy and return the proxy. + */ +static rl_t * +zfs_range_proxify(avl_tree_t *tree, rl_t *rl) +{ + rl_t *proxy; + + if (rl->r_proxy) + return (rl); /* already a proxy */ + + ASSERT3U(rl->r_cnt, ==, 1); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + avl_remove(tree, rl); + rl->r_cnt = 0; + + /* create a proxy range lock */ + proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); + proxy->r_off = rl->r_off; + proxy->r_len = rl->r_len; + proxy->r_cnt = 1; + proxy->r_type = RL_READER; + proxy->r_proxy = B_TRUE; + proxy->r_write_wanted = B_FALSE; + proxy->r_read_wanted = B_FALSE; + avl_add(tree, proxy); + + return (proxy); +} + +/* + * Split the range lock at the supplied offset + * returning the *front* proxy. + */ +static rl_t * +zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) +{ + rl_t *front, *rear; + + ASSERT3U(rl->r_len, >, 1); + ASSERT3U(off, >, rl->r_off); + ASSERT3U(off, <, rl->r_off + rl->r_len); + ASSERT(rl->r_write_wanted == B_FALSE); + ASSERT(rl->r_read_wanted == B_FALSE); + + /* create the rear proxy range lock */ + rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rear->r_off = off; + rear->r_len = rl->r_off + rl->r_len - off; + rear->r_cnt = rl->r_cnt; + rear->r_type = RL_READER; + rear->r_proxy = B_TRUE; + rear->r_write_wanted = B_FALSE; + rear->r_read_wanted = B_FALSE; + + front = zfs_range_proxify(tree, rl); + front->r_len = off - rl->r_off; + + avl_insert_here(tree, rear, front, AVL_AFTER); + return (front); +} + +/* + * Create and add a new proxy range lock for the supplied range. 
+ */ +static void +zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +{ + rl_t *rl; + + ASSERT(len); + rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); + rl->r_off = off; + rl->r_len = len; + rl->r_cnt = 1; + rl->r_type = RL_READER; + rl->r_proxy = B_TRUE; + rl->r_write_wanted = B_FALSE; + rl->r_read_wanted = B_FALSE; + avl_add(tree, rl); +} + +static void +zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) +{ + rl_t *next; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * prev arrives either: + * - pointing to an entry at the same offset + * - pointing to the entry with the closest previous offset whose + * range may overlap with the new range + * - null, if there were no ranges starting before the new one + */ + if (prev) { + if (prev->r_off + prev->r_len <= off) { + prev = NULL; + } else if (prev->r_off != off) { + /* + * convert to proxy if needed then + * split this entry and bump ref count + */ + prev = zfs_range_split(tree, prev, off); + prev = AVL_NEXT(tree, prev); /* move to rear range */ + } + } + ASSERT((prev == NULL) || (prev->r_off == off)); + + if (prev) + next = prev; + else + next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + + if (next == NULL || off + len <= next->r_off) { + /* no overlaps, use the original new rl_t in the tree */ + avl_insert(tree, new, where); + return; + } + + if (off < next->r_off) { + /* Add a proxy for initial range before the overlap */ + zfs_range_new_proxy(tree, off, next->r_off - off); + } + + new->r_cnt = 0; /* will use proxies in tree */ + /* + * We now search forward through the ranges, until we go past the end + * of the new range. For each entry we make it a proxy if it + * isn't already, then bump its reference count. If there's any + * gaps between the ranges then we create a new proxy range. + */ + for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { + if (off + len <= next->r_off) + break; + if (prev && prev->r_off + prev->r_len < next->r_off) { + /* there's a gap */ + ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + next->r_off - (prev->r_off + prev->r_len)); + } + if (off + len == next->r_off + next->r_len) { + /* exact overlap with end */ + next = zfs_range_proxify(tree, next); + next->r_cnt++; + return; + } + if (off + len < next->r_off + next->r_len) { + /* new range ends in the middle of this block */ + next = zfs_range_split(tree, next, off + len); + next->r_cnt++; + return; + } + ASSERT3U(off + len, >, next->r_off + next->r_len); + next = zfs_range_proxify(tree, next); + next->r_cnt++; + } + + /* Add the remaining end range. */ + zfs_range_new_proxy(tree, prev->r_off + prev->r_len, + (off + len) - (prev->r_off + prev->r_len)); +} + +/* + * Check if a reader lock can be grabbed, or wait and recheck until available. + */ +static void +zfs_range_lock_reader(znode_t *zp, rl_t *new) +{ + avl_tree_t *tree = &zp->z_range_avl; + rl_t *prev, *next; + avl_index_t where; + uint64_t off = new->r_off; + uint64_t len = new->r_len; + + /* + * Look for any writer locks in the range. + */ +retry: + prev = avl_find(tree, new, &where); + if (prev == NULL) + prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + + /* + * Check the previous range for a writer lock overlap. 
+	 */
+	if (prev && (off < prev->r_off + prev->r_len)) {
+		if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
+			if (!prev->r_read_wanted) {
+				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
+				prev->r_read_wanted = B_TRUE;
+			}
+			cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
+			goto retry;
+		}
+		if (off + len < prev->r_off + prev->r_len)
+			goto got_lock;
+	}
+
+	/*
+	 * Search through the following ranges to see if there's
+	 * any write lock overlap.
+	 */
+	if (prev)
+		next = AVL_NEXT(tree, prev);
+	else
+		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+	for (; next; next = AVL_NEXT(tree, next)) {
+		if (off + len <= next->r_off)
+			goto got_lock;
+		if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
+			if (!next->r_read_wanted) {
+				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
+				next->r_read_wanted = B_TRUE;
+			}
+			cv_wait(&next->r_rd_cv, &zp->z_range_lock);
+			goto retry;
+		}
+		if (off + len <= next->r_off + next->r_len)
+			goto got_lock;
+	}
+
+got_lock:
+	/*
+	 * Add the read lock, which may involve splitting existing
+	 * locks and bumping ref counts (r_cnt).
+	 */
+	zfs_range_add_reader(tree, new, prev, where);
+}
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER)
+ * or exclusive (RL_WRITER). Returns the range lock structure
+ * for later unlocking or range reduction (if the entire file
+ * was previously locked as RL_WRITER).
+ */
+rl_t *
+zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+{
+	rl_t *new;
+
+	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
+
+	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+	new->r_zp = zp;
+	new->r_off = off;
+	new->r_len = len;
+	new->r_cnt = 1; /* assume it's going to be in the tree */
+	new->r_type = type;
+	new->r_proxy = B_FALSE;
+	new->r_write_wanted = B_FALSE;
+	new->r_read_wanted = B_FALSE;
+
+	mutex_enter(&zp->z_range_lock);
+	if (type == RL_READER) {
+		/*
+		 * First check for the usual case of no locks
+		 */
+		if (avl_numnodes(&zp->z_range_avl) == 0)
+			avl_add(&zp->z_range_avl, new);
+		else
+			zfs_range_lock_reader(zp, new);
+	} else
+		zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
+	mutex_exit(&zp->z_range_lock);
+	return (new);
+}
+
+/*
+ * Unlock a reader lock
+ */
+static void
+zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
+{
+	avl_tree_t *tree = &zp->z_range_avl;
+	rl_t *rl, *next;
+	uint64_t len;
+
+	/*
+	 * The common case is when the remove entry is in the tree
+	 * (cnt == 1) meaning there have been no other reader locks
+	 * overlapping with this one. Otherwise the remove entry will
+	 * have been removed from the tree and replaced by proxies (one
+	 * or more ranges mapping to the entire range).
+	 */
+	if (remove->r_cnt == 1) {
+		avl_remove(tree, remove);
+		if (remove->r_write_wanted) {
+			cv_broadcast(&remove->r_wr_cv);
+			cv_destroy(&remove->r_wr_cv);
+		}
+		if (remove->r_read_wanted) {
+			cv_broadcast(&remove->r_rd_cv);
+			cv_destroy(&remove->r_rd_cv);
+		}
+	} else {
+		ASSERT3U(remove->r_cnt, ==, 0);
+		ASSERT3U(remove->r_write_wanted, ==, 0);
+		ASSERT3U(remove->r_read_wanted, ==, 0);
+		/*
+		 * Find start proxy representing this reader lock,
+		 * then decrement ref count on all proxies
+		 * that make up this range, freeing them as needed.
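+		 * For example, if readers hold [0,100) and [50,150) then
+		 * the tree holds proxies [0,50) cnt=1, [50,100) cnt=2 and
+		 * [100,150) cnt=1; unlocking the first reader decrements
+		 * the first two counts and frees the [0,50) proxy.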
+		 */
+		rl = avl_find(tree, remove, NULL);
+		ASSERT(rl);
+		ASSERT(rl->r_cnt);
+		ASSERT(rl->r_type == RL_READER);
+		for (len = remove->r_len; len != 0; rl = next) {
+			len -= rl->r_len;
+			if (len) {
+				next = AVL_NEXT(tree, rl);
+				ASSERT(next);
+				ASSERT(rl->r_off + rl->r_len == next->r_off);
+				ASSERT(next->r_cnt);
+				ASSERT(next->r_type == RL_READER);
+			}
+			rl->r_cnt--;
+			if (rl->r_cnt == 0) {
+				avl_remove(tree, rl);
+				if (rl->r_write_wanted) {
+					cv_broadcast(&rl->r_wr_cv);
+					cv_destroy(&rl->r_wr_cv);
+				}
+				if (rl->r_read_wanted) {
+					cv_broadcast(&rl->r_rd_cv);
+					cv_destroy(&rl->r_rd_cv);
+				}
+				kmem_free(rl, sizeof (rl_t));
+			}
+		}
+	}
+	kmem_free(remove, sizeof (rl_t));
+}
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void
+zfs_range_unlock(rl_t *rl)
+{
+	znode_t *zp = rl->r_zp;
+
+	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
+	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
+	ASSERT(!rl->r_proxy);
+
+	mutex_enter(&zp->z_range_lock);
+	if (rl->r_type == RL_WRITER) {
+		/* writer locks can't be shared or split */
+		avl_remove(&zp->z_range_avl, rl);
+		mutex_exit(&zp->z_range_lock);
+		if (rl->r_write_wanted) {
+			cv_broadcast(&rl->r_wr_cv);
+			cv_destroy(&rl->r_wr_cv);
+		}
+		if (rl->r_read_wanted) {
+			cv_broadcast(&rl->r_rd_cv);
+			cv_destroy(&rl->r_rd_cv);
+		}
+		kmem_free(rl, sizeof (rl_t));
+	} else {
+		/*
+		 * lock may be shared, let zfs_range_unlock_reader()
+		 * release the lock and free the rl_t
+		 */
+		zfs_range_unlock_reader(zp, rl);
+		mutex_exit(&zp->z_range_lock);
+	}
+}
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file is exclusively locked and so there's only one
+ * entry in the tree.
+ */
+void
+zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+{
+	znode_t *zp = rl->r_zp;
+
+	/* Ensure there are no other locks */
+	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
+	ASSERT(rl->r_off == 0);
+	ASSERT(rl->r_type == RL_WRITER);
+	ASSERT(!rl->r_proxy);
+	ASSERT3U(rl->r_len, ==, UINT64_MAX);
+	ASSERT3U(rl->r_cnt, ==, 1);
+
+	mutex_enter(&zp->z_range_lock);
+	rl->r_off = off;
+	rl->r_len = len;
+	mutex_exit(&zp->z_range_lock);
+	if (rl->r_write_wanted)
+		cv_broadcast(&rl->r_wr_cv);
+	if (rl->r_read_wanted)
+		cv_broadcast(&rl->r_rd_cv);
+}
+
+/*
+ * AVL comparison function used to order range locks.
+ * Locks are ordered on the start offset of the range.
+ */
+int
+zfs_range_compare(const void *arg1, const void *arg2)
+{
+	const rl_t *rl1 = arg1;
+	const rl_t *rl2 = arg2;
+
+	if (rl1->r_off > rl2->r_off)
+		return (1);
+	if (rl1->r_off < rl2->r_off)
+		return (-1);
+	return (0);
+}
diff --git a/zfs/lib/libdmu-ctl/zfs_vfsops.c b/zfs/lib/libdmu-ctl/zfs_vfsops.c
new file mode 100644
index 0000000000..39c8ce4ef8
--- /dev/null
+++ b/zfs/lib/libdmu-ctl/zfs_vfsops.c
@@ -0,0 +1,1671 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "@(#)zfs_vfsops.c 1.41 08/04/11 SMI" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fs/fs_subr.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int zfsfstype; +vfsops_t *zfs_vfsops = NULL; +static major_t zfs_major; +static minor_t zfs_minor; +static kmutex_t zfs_dev_mtx; + +static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); +static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); +static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); +static int zfs_root(vfs_t *vfsp, vnode_t **vpp); +static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); +static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); +static void zfs_freevfs(vfs_t *vfsp); + +static const fs_operation_def_t zfs_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, + VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, + VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, + VFSNAME_ROOT, { .vfs_root = zfs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, + VFSNAME_SYNC, { .vfs_sync = zfs_sync }, + VFSNAME_VGET, { .vfs_vget = zfs_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, + NULL, NULL +}; + +static const fs_operation_def_t zfs_vfsops_eio_template[] = { + VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, + NULL, NULL +}; + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; +static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; +static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; +static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; + +/* + * MO_DEFAULT is not used since the default value is determined + * by the equivalent property. + */ +static mntopt_t mntopts[] = { + { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, + { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, + { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, + { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } +}; + +static mntopts_t zfs_mntopts = { + sizeof (mntopts) / sizeof (mntopt_t), + mntopts +}; + +/*ARGSUSED*/ +int +zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) +{ + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * SYNC_ATTR is used by fsflush() to force old filesystems like UFS + * to sync metadata, which they would otherwise cache indefinitely. + * Semantically, the only requirement is that the sync be initiated. + * The DMU syncs out txgs frequently, so there's nothing to do. + */ + if (flag & SYNC_ATTR) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. 
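+		 * (Passing a seq of UINT64_MAX asks zil_commit() to
+		 * commit every outstanding log record for this
+		 * filesystem.)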
+ */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, UINT64_MAX, 0); + else + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. + */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval < SPA_MINBLOCKSIZE || + newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) + newval = SPA_MAXBLOCKSIZE; + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->vfs_bsize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + } else { + /* XXX locking on vfs_flag? 
*/ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + } +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +/* + * The nbmand mount option can be changed at mount time. + * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; + int readonly, do_readonly = B_FALSE; + int setuid, do_setuid = B_FALSE; + int exec, do_exec = B_FALSE; + int devices, do_devices = B_FALSE; + int xattr, do_xattr = B_FALSE; + int atime, do_atime = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. 
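+	 * For example, a "mount -o ro" of a dataset whose readonly
+	 * property is off is remembered via do_readonly/readonly below
+	 * and re-applied through readonly_changed_cb() once
+	 * registration is complete.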
+ */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices = B_FALSE; + setuid = B_FALSE; + do_devices = B_TRUE; + do_setuid = B_TRUE; + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { + devices = B_TRUE; + do_devices = B_TRUE; + } + + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { + xattr = B_FALSE; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { + xattr = B_TRUE; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else { + char osname[MAXNAMELEN]; + + dmu_objset_name(os, osname); + if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, + NULL)) + return (error); + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + ds = dmu_objset_ds(os); + error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "xattr", xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "recordsize", blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "readonly", readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "devices", devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "setuid", setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "exec", exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "snapdir", snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclmode", acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclinherit", acl_inherit_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "vscan", vscan_changed_cb, zfsvfs); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. 
+ */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); + + return (0); + +unregister: + /* + * We may attempt to unregister some callbacks that are not + * registered, but this is OK; it will simply return ENOMSG, + * which we will ignore. + */ + (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, + zfsvfs); + (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); + return (error); + +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + uint_t readonly; + int error; + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + else + zfs_unlinked_drain(zfsvfs); + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest doesn't + * use readonly mounts, where zfs_unlinked_drain() isn't + * called.) This is because ziltest causes spa_sync() + * to think it's committed, but actually it is not, so + * the intent log contains many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated in + * a yet later txg. This would write a "create object + * N" record to the intent log. Normally, this would be + * fine because the spa_sync() would have written out + * the fact that object N is free, before we could write + * the "create object N" intent log record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. 
+ */ + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector); + + zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ + } + + if (!zil_disable) + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + return (0); +} + +static void +zfs_freezfsvfs(zfsvfs_t *zfsvfs) +{ + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_online_recv_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrw_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr) +{ + dev_t mount_dev; + uint64_t recordsize, readonly; + int error = 0; + int mode; + zfsvfs_t *zfsvfs; + znode_t *zp = NULL; + + ASSERT(vfsp); + ASSERT(osname); + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[]. + */ + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_vfs = vfsp; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_assign = TXG_NOWAIT; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + + /* Initialize the generic filesystem structure. */ + vfsp->vfs_bcount = 0; + vfsp->vfs_data = NULL; + + if (zfs_create_unique_device(&mount_dev) == -1) { + error = ENODEV; + goto out; + } + ASSERT(vfs_devismounted(mount_dev) == 0); + + if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, + NULL)) + goto out; + + vfsp->vfs_dev = mount_dev; + vfsp->vfs_fstype = zfsfstype; + vfsp->vfs_bsize = recordsize; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfsp->vfs_data = zfsvfs; + + if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) + goto out; + + if (readonly) + mode = DS_MODE_PRIMARY | DS_MODE_READONLY; + else + mode = DS_MODE_PRIMARY; + + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (error == EROFS) { + mode = DS_MODE_PRIMARY | DS_MODE_READONLY; + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, + &zfsvfs->z_os); + } + + if (error) + goto out; + + if (error = zfs_init_fs(zfsvfs, &zp, cr)) + goto out; + + /* The call to zfs_init_fs leaves the vnode held, release it here. */ + VN_RELE(ZTOV(zp)); + + /* + * Set features for file system. 
+ */
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ if (zfsvfs->z_use_fuids) {
+ vfs_set_feature(vfsp, VFSFT_XVATTR);
+ vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
+ }
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
+ } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ }
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ ASSERT(mode & DS_MODE_READONLY);
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ } else {
+ error = zfsvfs_setup(zfsvfs, B_TRUE);
+ }
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ if (zfsvfs->z_os)
+ dmu_objset_close(zfsvfs->z_os);
+ zfs_freezfsvfs(zfsvfs);
+ } else {
+ atomic_add_32(&zfs_active_fs_count, 1);
+ }
+
+ return (error);
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+ struct dsl_dataset *ds;
+
+ /*
+ * Unregister properties.
+ */
+ if (!dmu_objset_is_snapshot(os)) {
+ ds = dmu_objset_ds(os);
+ VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "aclinherit",
+ acl_inherit_changed_cb, zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "vscan",
+ vscan_changed_cb, zfsvfs) == 0);
+ }
+}
+
+/*
+ * Convert a decimal digit string to a uint64_t integer.
+ */
+static int
+str_to_uint64(char *str, uint64_t *objnum)
+{
+ uint64_t num = 0;
+
+ while (*str) {
+ if (*str < '0' || *str > '9')
+ return (EINVAL);
+
+ num = num*10 + *str++ - '0';
+ }
+
+ *objnum = num;
+ return (0);
+}
+
+/*
+ * The boot path passed from the boot loader is in the form of
+ * "rootpool-name/root-filesystem-object-number". Convert this
+ * string to a dataset name: "rootpool-name/root-filesystem-name".
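+ * For example, a boot property of "rpool/85" refers to object
+ * number 85 in pool "rpool"; if that object is the dataset
+ * "rpool/ROOT/zfsroot" (an illustrative name), the converted
+ * result is "rpool/ROOT/zfsroot".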
+ */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (EINVAL); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + (void) strcpy(outpath, bpath); + return (0); + } + + if (error = str_to_uint64(slashp+1, &objnum)) + return (error); + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +static int +zfs_mountroot(vfs_t *vfsp, enum whymountroot why) +{ + int error = 0; + static int zfsrootdone = 0; + zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + char *zfs_bootfs; + + ASSERT(vfsp); + + /* + * The filesystem that we mount as root is defined in the + * boot property "zfs-bootfs" with a format of + * "poolname/root-dataset-objnum". + */ + if (why == ROOT_INIT) { + if (zfsrootdone++) + return (EBUSY); + /* + * the process of doing a spa_load will require the + * clock to be set before we could (for example) do + * something better by looking at the timestamp on + * an uberblock, so just set it to -1. + */ + clkset(-1); + + if ((zfs_bootfs = spa_get_bootfs()) == NULL) { + cmn_err(CE_NOTE, "\nspa_get_bootfs: can not get " + "bootfs name \n"); + return (EINVAL); + } + + if (error = spa_import_rootpool(rootfs.bo_name)) { + spa_free_bootfs(zfs_bootfs); + cmn_err(CE_NOTE, "\nspa_import_rootpool: error %d\n", + error); + return (error); + } + + if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { + spa_free_bootfs(zfs_bootfs); + cmn_err(CE_NOTE, "\nzfs_parse_bootfs: error %d\n", + error); + return (error); + } + + spa_free_bootfs(zfs_bootfs); + + if (error = vfs_lock(vfsp)) + return (error); + + if (error = zfs_domount(vfsp, rootfs.bo_name, CRED())) { + cmn_err(CE_NOTE, "\nzfs_domount: error %d\n", error); + goto out; + } + + zfsvfs = (zfsvfs_t *)vfsp->vfs_data; + ASSERT(zfsvfs); + if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { + cmn_err(CE_NOTE, "\nzfs_zget: error %d\n", error); + goto out; + } + + vp = ZTOV(zp); + mutex_enter(&vp->v_lock); + vp->v_flag |= VROOT; + mutex_exit(&vp->v_lock); + rootvp = vp; + + /* + * The zfs_zget call above returns with a hold on vp, we release + * it here. + */ + VN_RELE(vp); + + vfs_add((struct vnode *)0, vfsp, + (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); +out: + vfs_unlock(vfsp); + return (error); + } else if (why == ROOT_REMOUNT) { + readonly_changed_cb(vfsp->vfs_data, B_FALSE); + vfsp->vfs_flag |= VFS_REMOUNT; + + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + return (zfs_register_callbacks(vfsp)); + + } else if (why == ROOT_UNMOUNT) { + zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); + (void) zfs_sync(vfsp, 0, 0); + return (0); + } + + /* + * if "why" is equal to anything else other than ROOT_INIT, + * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. + */ + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + char *osname; + pathname_t spn; + int error = 0; + uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 
+ UIO_SYSSPACE : UIO_USERSPACE;
+ int canwrite;
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_REMOUNT) == 0 &&
+ (uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * ZFS does not support passing unparsed data in via MS_DATA.
+ * Users should use the MS_OPTIONSTR interface; this means
+ * that all option parsing is already done and the options struct
+ * can be interrogated.
+ */
+ if ((uap->flags & MS_DATA) && uap->datalen > 0)
+ return (EINVAL);
+
+ /*
+ * Get the objset name (the "special" mount argument).
+ */
+ if (error = pn_get(uap->spec, fromspace, &spn))
+ return (error);
+
+ osname = spn.pn_path;
+
+ /*
+ * Check for mount privilege?
+ *
+ * If we don't have privilege then see if
+ * we have local permission to allow it
+ */
+ error = secpolicy_fs_mount(cr, mvp, vfsp);
+ if (error) {
+ error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
+ if (error == 0) {
+ vattr_t vattr;
+
+ /*
+ * Make sure user is the owner of the mount point
+ * or has sufficient privileges.
+ */
+
+ vattr.va_mask = AT_UID;
+
+ if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
+ goto out;
+ }
+
+ if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
+ VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
+ error = EPERM;
+ goto out;
+ }
+
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ } else {
+ goto out;
+ }
+ }
+
+ /*
+ * Refuse to mount a filesystem if we are in a local zone and the
+ * dataset is not visible.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+ error = EPERM;
+ goto out;
+ }
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (uap->flags & MS_REMOUNT) {
+ /* refresh mount options */
+ zfs_unregister_callbacks(vfsp->vfs_data);
+ error = zfs_register_callbacks(vfsp);
+ goto out;
+ }
+
+ error = zfs_domount(vfsp, osname, cr);
+
+out:
+ pn_free(&spn);
+ return (error);
+}
+
+static int
+zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ dev32_t d32;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ /*
+ * The underlying storage pool actually uses multiple block sizes.
+ * We report the fragsize as the smallest block size we support,
+ * and we report our blocksize as the filesystem's maximum blocksize.
+ */
+ statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
+ statp->f_bsize = zfsvfs->z_max_blksz;
+
+ /*
+ * The following report "total" blocks of various kinds in the
+ * file system, but reported in terms of f_frsize - the
+ * "fragment" size.
+ */
+
+ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
+ statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+ * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
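+ * For example, with 200,000 free fragments but room for a
+ * million more objects, f_ffree is capped at 200,000, since
+ * creating each new object consumes at least one block.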
+ */
+ statp->f_ffree = MIN(availobjs, statp->f_bfree);
+ statp->f_favail = statp->f_ffree; /* no "root reservation" */
+ statp->f_files = statp->f_ffree + usedobjs;
+
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ statp->f_fsid = d32;
+
+ /*
+ * We're a zfs filesystem.
+ */
+ (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
+
+ statp->f_flag = vf_to_stf(vfsp->vfs_flag);
+
+ statp->f_namemax = ZFS_MAXNAMELEN;
+
+ /*
+ * We have all of 32 characters to stuff a string here.
+ * Is there anything useful we could/should provide?
+ */
+ bzero(statp->f_fstr, sizeof (statp->f_fstr));
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *vpp = ZTOV(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Teardown the zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+
+ rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's vfsp as the parent
+ * filesystem and all of its snapshots have their vnode's
+ * v_vfsp set to the parent's filesystem's vfsp. Note,
+ * 'z_parent' is self referential for non-snapshots.
+ */
+ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+ /*
+ * If we are not unmounting (ie: online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ return (EIO);
+ }
+
+ /*
+ * At this point there are no vops active, and any new vops will
+ * fail with EIO since we have z_teardown_lock for writer (only
+ * relevant for forced unmount).
+ *
+ * Release all holds on dbufs.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp))
+ if (zp->z_dbuf) {
+ ASSERT(ZTOV(zp)->v_count > 0);
+ zfs_znode_dmu_fini(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new vops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other vops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ }
+
+ /*
+ * z_os will be NULL if there was an error in attempting to reopen
+ * zfsvfs, so just return as the properties had already been
+ * unregistered and cached data had been evicted before.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + (void) dmu_objset_evict_dbufs(zfsvfs->z_os); + } + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; + int ret; + + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr); + if (ret) + return (ret); + } + + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots. + */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL && + (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { + return (ret); + } + + if (!(fflag & MS_FORCE)) { + /* + * Check the number of active vnodes in the file system. + * Our count is maintained in the vfs structure, but the + * number is off by 1 to indicate a hold on the vfs + * structure itself. + * + * The '.zfs' directory maintains a reference of its + * own, and any active references underneath are + * reflected in the vnode count. + */ + if (zfsvfs->z_ctldir == NULL) { + if (vfsp->vfs_count > 1) + return (EBUSY); + } else { + if (vfsp->vfs_count > 2 || + zfsvfs->z_ctldir->v_count > 1) + return (EBUSY); + } + } + + vfsp->vfs_flag |= VFS_UNMOUNTED; + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os->os_user_ptr_lock); + + /* + * Finally close the objset + */ + dmu_objset_close(os); + } + + /* + * We can now safely destroy the '.zfs' directory node. 
+ */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + if (fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (EINVAL); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* A zero fid_gen means we are in the .zfs control directories */ + if (fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { + *vpp = zfsvfs->z_ctldir; + ASSERT(*vpp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { + VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, + 0, NULL, NULL, NULL, NULL, NULL) == 0); + } else { + VN_HOLD(*vpp); + } + ZFS_EXIT(zfsvfs); + return (0); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if (err = zfs_zget(zfsvfs, object, &zp)) { + ZFS_EXIT(zfsvfs); + return (err); + } + zp_gen = zp->z_phys->zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + VN_RELE(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + *mode = zfsvfs->z_os->os_mode; + dmu_objset_name(zfsvfs->z_os, name); + dmu_objset_close(zfsvfs->z_os); + + return (0); +} + +/* + * Reopen zfsvfs_t::z_os and release VOPs. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +{ + int err; + + ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + if (err) { + zfsvfs->z_os = NULL; + } else { + znode_t *zp; + + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. 
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ (void) zfs_rezget(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ }
+
+ /* release the VOPs */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+ if (err) {
+ /*
+ * Since we couldn't reopen zfsvfs::z_os, force
+ * unmount this file system.
+ */
+ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
+ (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
+ }
+ return (err);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ int i;
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+
+ zfs_fuid_destroy(zfsvfs);
+ zfs_freezfsvfs(zfsvfs);
+
+ atomic_add_32(&zfs_active_fs_count, -1);
+}
+
+/*
+ * VFS_INIT() initialization. Note that there is no VFS_FINI(),
+ * so we can't safely do any non-idempotent initialization here.
+ * Leave that to zfs_init() and zfs_fini(), which are called
+ * from the module's _init() and _fini() entry points.
+ */
+/*ARGSUSED*/
+static int
+zfs_vfsinit(int fstype, char *name)
+{
+ int error;
+
+ zfsfstype = fstype;
+
+ /*
+ * Setup vfsops and vnodeops tables.
+ */
+ error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
+ if (error != 0) {
+ cmn_err(CE_WARN, "zfs: bad vfs ops template");
+ return (error);
+ }
+
+ error = zfs_create_op_tables();
+ if (error) {
+ zfs_remove_op_tables();
+ cmn_err(CE_WARN, "zfs: bad vnode ops template");
+ (void) vfs_freevfsops_by_type(zfsfstype);
+ return (error);
+ }
+
+ mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Unique major number for all zfs mounts.
+ * If we run out of 32-bit minors, we'll getudev() another major.
+ */
+ zfs_major = ddi_name_to_major(ZFS_DRIVER);
+ zfs_minor = ZFS_MIN_MINOR;
+
+ return (0);
+}
+
+void
+zfs_init(void)
+{
+ /*
+ * Initialize .zfs directory structures
+ */
+ zfsctl_init();
+
+ /*
+ * Initialize znode cache, vnode ops, etc...
+ */
+ zfs_znode_init();
+}
+
+void
+zfs_fini(void)
+{
+ zfsctl_fini();
+ zfs_znode_fini();
+}
+
+int
+zfs_busy(void)
+{
+ return (zfs_active_fs_count != 0);
+}
+
+int
+zfs_set_version(const char *name, uint64_t newvers)
+{
+ int error;
+ objset_t *os;
+ dmu_tx_t *tx;
+ uint64_t curvers;
+
+ /*
+ * XXX for now, require that the filesystem be unmounted. Would
+ * be nice to find the zfsvfs_t and just update that if
+ * possible.
+ */
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (EINVAL);
+
+ error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_PRIMARY, &os);
+ if (error)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &curvers);
+ if (error)
+ goto out;
+ if (newvers < curvers) {
+ error = EINVAL;
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
+ &newvers, tx);
+
+ spa_history_internal_log(LOG_DS_UPGRADE,
+ dmu_objset_spa(os), tx, CRED(),
+ "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
+ dmu_objset_id(os));
+ dmu_tx_commit(tx);
+
+out:
+ dmu_objset_close(os);
+ return (error);
+}
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ const char *pname;
+ int error;
+
+ /*
+ * Look up the file system's value for the property.
For the + * version property, we look up a slightly different string. + */ + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + return (error); +} + +static vfsdef_t vfw = { + VFSDEF_VERSION, + MNTTYPE_ZFS, + zfs_vfsinit, + VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| + VSW_XID, + &zfs_mntopts +}; + +struct modlfs zfs_modlfs = { + &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw +}; diff --git a/zfs/lib/libdmu-ctl/zfs_vnops.c b/zfs/lib/libdmu-ctl/zfs_vnops.c new file mode 100644 index 0000000000..3f36328de3 --- /dev/null +++ b/zfs/lib/libdmu-ctl/zfs_vnops.c @@ -0,0 +1,4558 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Portions Copyright 2007 Jeremy Teo */ + +#pragma ident "@(#)zfs_vnops.c 1.73 08/04/27 SMI" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fs/fs_subr.h" +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. 
+ * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). + * In normal operation, this will be TXG_NOWAIT. During ZIL replay, + * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * This is critical because we don't want to block while holding locks. + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing to + * use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, seq, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(*vpp); + + if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + return (EPERM); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) + if (fs_vscan(*vpp, cr, 0) != 0) + return (EACCES); + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + + /* Decrement the synchronous opens in the znode */ + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + atomic_dec_32(&zp->z_sync_cnt); + + /* + * Clean up any locks held by this process on the vp. + */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && + zp->z_phys->zp_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); + + return (0); +} + +/* + * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and + * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey(vnode_t *vp, int cmd, offset_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_phys->zp_size; + if (noff >= file_sz) { + return (ENXIO); + } + + if (cmd == _FIO_SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + + /* end of file? */ + if ((error == ESRCH) || (noff > file_sz)) { + /* + * Handle the virtual hole at the end of file. + */ + if (hole) { + *off = file_sz; + return (0); + } + return (ENXIO); + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, + int *rvalp, caller_context_t *ct) +{ + offset_t off; + int error; + zfsvfs_t *zfsvfs; + znode_t *zp; + + switch (com) { + case _FIOFFS: + return (zfs_sync(vp->v_vfsp, 0, cred)); + + /* + * The following two ioctls are used by bfu. 
Faking out, + * necessary to avoid bfu errors. + */ + case _FIOGDIO: + case _FIOSDIO: + return (0); + + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: + if (ddi_copyin((void *)data, &off, sizeof (off), flag)) + return (EFAULT); + + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* offset parameter is in/out */ + error = zfs_holey(vp, com, &off); + ZFS_EXIT(zfsvfs); + if (error) + return (error); + if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) + return (EFAULT); + return (0); + } + return (ENOTTY); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int64_t start, off; + int len = nbytes; + int error = 0; + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + uint64_t woff = uio->uio_loffset; + + /* + * We don't want a new page to "appear" in the middle of + * the file update (because it may not get the write + * update data), so we grab a lock to block + * zfs_getpage(). + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (pp = page_lookup(vp, start, SE_SHARED)) { + caddr_t va; + + rw_exit(&zp->z_map_lock); + va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L); + error = uiomove(va+off, bytes, UIO_WRITE, uio); + if (error == 0) { + dmu_write(zfsvfs->z_os, zp->z_id, + woff, bytes, va+off, tx); + } + ppmapout(va); + page_unlock(pp); + } else { + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + uio, bytes, tx); + rw_exit(&zp->z_map_lock); + } + len -= bytes; + off = 0; + if (error) + break; + } + return (error); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + int64_t start, off; + int len = nbytes; + int error = 0; + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + if (pp = page_lookup(vp, start, SE_SHARED)) { + caddr_t va; + + va = ppmapin(pp, PROT_READ, (caddr_t)-1L); + error = uiomove(va + off, bytes, UIO_READ, uio); + ppmapout(va); + page_unlock(pp); + } else { + error = dmu_read_uio(os, zp->z_id, uio, bytes); + } + len -= bytes; + off = 0; + if (error) + break; + } + return (error); +} + +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - updated offset and range, buffer filled. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + ssize_t n, nbytes; + int error; + rl_t *rl; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + os = zfsvfs->z_os; + + if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (EACCES); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Check for mandatory locks + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (error = chklock(vp, FREAD, + uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + if (ioflag & FRSYNC) + zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + + /* + * Lock the range against changes. + */ + rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio->uio_loffset >= zp->z_phys->zp_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_phys->zp_size); + n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (vn_has_cached_data(vp)) + error = mappedread(vp, nbytes, uio); + else + error = dmu_read_uio(os, zp->z_id, uio, nbytes); + if (error) + break; + + n -= nbytes; + } + +out: + zfs_range_unlock(rl); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Fault in the pages of the first n bytes specified by the uio structure. + * 1 byte in each page is touched and the uio struct is unmodified. + * Any error will exit this routine as this is only a best + * attempt to get the pages resident. This is a copy of ufs_trans_touch(). + */ +static void +zfs_prefault_write(ssize_t n, struct uio *uio) +{ + struct iovec *iov; + ulong_t cnt, incr; + caddr_t p; + uint8_t tmp; + + iov = uio->uio_iov; + + while (n) { + cnt = MIN(iov->iov_len, n); + if (cnt == 0) { + /* empty iov entry */ + iov++; + continue; + } + n -= cnt; + /* + * touch each page in this segment. + */ + p = iov->iov_base; + while (cnt) { + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + if (fuword8(p, &tmp)) + return; + break; + case UIO_SYSSPACE: + if (kcopy(p, &tmp, 1)) + return; + break; + } + incr = MIN(cnt, PAGESIZE); + p += incr; + cnt -= incr; + } + /* + * touch the last byte in case it straddles a page. + */ + p--; + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + if (fuword8(p, &tmp)) + return; + break; + case UIO_SYSSPACE: + if (kcopy(p, &tmp, 1)) + return; + break; + } + iov++; + } +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND flag set if in append mode. + * cr - credentials of caller. + * ct - caller context (NFS/CIFS fem monitor only) + * + * OUT: uio - updated offset and range. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + rlim64_t limit = uio->uio_llimit; + ssize_t start_resid = uio->uio_resid; + ssize_t tx_bytes; + uint64_t end_size; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + offset_t woff; + ssize_t n, nbytes; + rl_t *rl; + int max_blksz = zfsvfs->z_max_blksz; + uint64_t pflags = zp->z_phys->zp_flags; + int error; + + /* + * If immutable or not appending then return EPERM + */ + if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_phys->zp_size))) + return (EPERM); + + /* + * Fasttrack empty write + */ + n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + */ + zfs_prefault_write(n, uio); + + /* + * If in append mode, set the io offset pointer to eof. + */ + if (ioflag & FAPPEND) { + /* + * Range lock for a file append: + * The value for the start of range will be determined by + * zfs_range_lock() (to guarantee append semantics). + * If this write will cause the block size to increase, + * zfs_range_lock() will lock the entire file, so we must + * later reduce the range after we grow the block size. + */ + rl = zfs_range_lock(zp, 0, n, RL_APPEND); + if (rl->r_len == UINT64_MAX) { + /* overlocked, zp_size can't change */ + woff = uio->uio_loffset = zp->z_phys->zp_size; + } else { + woff = uio->uio_loffset = rl->r_off; + } + } else { + woff = uio->uio_loffset; + /* + * Validate file offset + */ + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * If we need to grow the block size then zfs_range_lock() + * will lock a wider range than we request here. + * Later after growing the block size we reduce the range. + */ + rl = zfs_range_lock(zp, woff, n, RL_WRITER); + } + + if (woff >= limit) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (EFBIG); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* + * Check for mandatory locks + */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (error); + } + end_size = MAX(zp->z_phys->zp_size, woff + n); + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + /* + * Start a transaction. + */ + woff = uio->uio_loffset; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + continue; + } + dmu_tx_abort(tx); + break; + } + + /* + * If zfs_range_lock() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since zfs_range_reduce() will + * shrink down r_len to the appropriate size. 
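+ * (An r_len of UINT64_MAX is how zfs_range_lock() indicates
+ * that it locked the entire file; see the RL_APPEND comment
+ * above.)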
+ */
+ if (rl->r_len == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_range_reduce(rl, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+ rw_enter(&zp->z_map_lock, RW_READER);
+
+ tx_bytes = uio->uio_resid;
+ if (vn_has_cached_data(vp)) {
+ rw_exit(&zp->z_map_lock);
+ error = mappedwrite(vp, nbytes, uio, tx);
+ } else {
+ error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+ uio, nbytes, tx);
+ rw_exit(&zp->z_map_lock);
+ }
+ tx_bytes -= uio->uio_resid;
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the execute bits is set.
+ *
+ * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ (zp->z_phys->zp_mode & S_ISUID) != 0 &&
+ zp->z_phys->zp_uid == 0) != 0) {
+ zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ /*
+ * Update time stamp. NOTE: This marks the bonus buffer as
+ * dirty, so we don't have to do it again for zp_size.
+ */
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
+ (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+ uio->uio_loffset);
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT(tx_bytes == nbytes);
+ n -= nbytes;
+ }
+
+ zfs_range_unlock(rl);
+
+ /*
+ * If we're in replay mode, or we made no progress, return error.
+ * Otherwise, it's at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & (FSYNC | FDSYNC))
+ zil_commit(zilog, zp->z_last_itx, zp->z_id);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+void
+zfs_get_done(dmu_buf_t *db, void *vzgd)
+{
+ zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
+ vnode_t *vp = ZTOV(rl->r_zp);
+
+ dmu_buf_rele(db, vzgd);
+ zfs_range_unlock(rl);
+ VN_RELE(vp);
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
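+ *
+ * This is the callback passed to zil_open() in zfsvfs_setup();
+ * the ZIL invokes it while committing a synchronous write whose
+ * data was not stored directly in the log record.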
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t off = lr->lr_offset;
+ dmu_buf_t *db;
+ rl_t *rl;
+ zgd_t *zgd;
+ int dlen = lr->lr_length; /* length of user data */
+ int error = 0;
+
+ ASSERT(zio);
+ ASSERT(dlen != 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+ return (ENOENT);
+ if (zp->z_unlinked) {
+ VN_RELE(ZTOV(zp));
+ return (ENOENT);
+ }
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ rl = zfs_range_lock(zp, off, dlen, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (off >= zp->z_phys->zp_size) {
+ error = ENOENT;
+ goto out;
+ }
+ VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
+ } else { /* indirect write */
+ uint64_t boff; /* block starting offset */
+
+ /*
+ * Have to lock the whole block to ensure when it's
+ * written out and its checksum is being calculated
+ * that no one can change the data. We need to re-check
+ * blocksize after we get the lock in case it's changed!
+ */
+ for (;;) {
+ if (ISP2(zp->z_blksz)) {
+ boff = P2ALIGN_TYPED(off, zp->z_blksz,
+ uint64_t);
+ } else {
+ boff = 0;
+ }
+ dlen = zp->z_blksz;
+ rl = zfs_range_lock(zp, boff, dlen, RL_READER);
+ if (zp->z_blksz == dlen)
+ break;
+ zfs_range_unlock(rl);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (off >= zp->z_phys->zp_size) {
+ error = ENOENT;
+ goto out;
+ }
+ zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_rl = rl;
+ zgd->zgd_zilog = zfsvfs->z_log;
+ zgd->zgd_bp = &lr->lr_blkptr;
+ VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
+ ASSERT(boff == db->db_offset);
+ lr->lr_blkoff = off - boff;
+ error = dmu_sync(zio, db, &lr->lr_blkptr,
+ lr->lr_common.lrc_txg, zfs_get_done, zgd);
+ ASSERT((error && error != EINPROGRESS) ||
+ lr->lr_length <= zp->z_blksz);
+ if (error == 0)
+ zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
+ /*
+ * If we get EINPROGRESS, then we need to wait for a
+ * write IO initiated by dmu_sync() to complete before
+ * we can release this dbuf. We will finish everything
+ * up in the zfs_get_done() callback.
+ */
+ if (error == EINPROGRESS)
+ return (0);
+ dmu_buf_rele(db, zgd);
+ kmem_free(zgd, sizeof (zgd_t));
+ }
+out:
+ zfs_range_unlock(rl);
+ VN_RELE(ZTOV(zp));
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (flag & V_ACE_MASK)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+ else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN: dvp - vnode of directory to search.
+ * nm - name of entry to lookup.
+ * pnp - full pathname to lookup [UNUSED].
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * rdir - root directory vnode [UNUSED].
+ * cr - credentials of caller.
+ * ct - caller context + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +static int +zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *vpp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * If the xattr property is off, refuse the lookup request. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, + B_FALSE, cr)) { + VN_RELE(*vpp); + *vpp = NULL; + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + if (dvp->v_type != VDIR) { + ZFS_EXIT(zfsvfs); + return (ENOTDIR); + } + + /* + * Check accessibility of directory. + */ + + if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); + if (error == 0) { + /* + * Convert device special files + */ + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created or trunc'd entry. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +static int +zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } +top: + *vpp = NULL; + + if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~VSVTX; + + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + VN_HOLD(dvp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible VN_HOLD(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (strcmp(name, "..") == 0) + error = EISDIR; + ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (error); + } + } + if (vsecp && aclp == NULL) { + error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); + if (error) { + ZFS_EXIT(zfsvfs); + if (dl) + zfs_dirent_unlock(dl); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. 
+ */ + if ((dzp->z_phys->zp_flags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = EINVAL; + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || + IS_EPHEMERAL(crgetgid(cr))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, SPA_MAXBLOCKSIZE); + } + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + (void) zfs_link_create(dl, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, fuidp, vap); + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + } else { + int aflags = (flag & FAPPEND) ? V_APPEND : 0; + + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl == EXCL) { + error = EEXIST; + goto out; + } + /* + * Can't open a directory for writing. + */ + if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { + error = EISDIR; + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. + */ + if ((ZTOV(zp)->v_type == VREG) && + (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + error = zfs_freesp(zp, 0, 0, mode, TRUE); + if (error == ERESTART && + zfsvfs->z_assign == TXG_NOWAIT) { + /* NB: we already did dmu_tx_wait() */ + zfs_dirent_unlock(dl); + VN_RELE(ZTOV(zp)); + goto top; + } + + if (error == 0) { + vnevent_create(ZTOV(zp), ct); + } + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + VN_RELE(ZTOV(zp)); + } else { + *vpp = ZTOV(zp); + /* + * If vnode is for a device return a specfs vnode instead. + */ + if (IS_DEVVP(*vpp)) { + struct vnode *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + error = ENOSYS; + } + *vpp = svp; + } + } + if (aclp) + zfs_acl_free(aclp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. 
+ * ct - caller context + * flags - case flags + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ +/*ARGSUSED*/ +static int +zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, + int flags) +{ + znode_t *zp, *dzp = VTOZ(dvp); + znode_t *xzp = NULL; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; + int error; + int zflg = ZEXISTS; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } + +top: + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp)) { + if (realnmp) + pn_free(realnmp); + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + + if (error = zfs_zaccess_delete(dzp, zp, cr)) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = EPERM; + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + if (realnmp) + dnlc_remove(dvp, realnmp->pn_buf); + else + dnlc_remove(dvp, name); + + mutex_enter(&vp->v_lock); + may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); + mutex_exit(&vp->v_lock); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_bonus(tx, zp->z_id); + if (may_delete_now) + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + + /* are there any extended attributes? */ + if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { + /* XXX - do we need this if we are deleting? */ + dmu_tx_hold_bonus(tx, xattr_obj); + } + + /* are there any additional acls */ + if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && + may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (error) { + zfs_dirent_unlock(dl); + VN_RELE(vp); + if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + if (realnmp) + pn_free(realnmp); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. 
+ */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + mutex_enter(&vp->v_lock); + delete_now = may_delete_now && + vp->v_count == 1 && !vn_has_cached_data(vp) && + zp->z_phys->zp_xattr == xattr_obj && + zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; + mutex_exit(&vp->v_lock); + } + + if (delete_now) { + if (zp->z_phys->zp_xattr) { + error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + ASSERT3U(error, ==, 0); + ASSERT3U(xzp->z_phys->zp_links, ==, 2); + dmu_buf_will_dirty(xzp->z_dbuf, tx); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = 1; + xzp->z_phys->zp_links = 0; + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + zp->z_phys->zp_xattr = 0; /* probably unnecessary */ + } + mutex_enter(&zp->z_lock); + mutex_enter(&vp->v_lock); + vp->v_count--; + ASSERT3U(vp->v_count, ==, 0); + mutex_exit(&vp->v_lock); + mutex_exit(&zp->z_lock); + zfs_znode_delete(zp, tx); + } else if (unlinked) { + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name); + + dmu_tx_commit(tx); +out: + if (realnmp) + pn_free(realnmp); + + zfs_dirent_unlock(dl); + + if (!delete_now) { + VN_RELE(vp); + } else if (xzp) { + /* this rele delayed to prevent nesting transactions */ + VN_RELE(ZTOV(xzp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +static int +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, + caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp = NULL; + zfs_fuid_info_t *fuidp = NULL; + int zf = ZNEW; + + ASSERT(vap->va_type == VDIR); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| + IS_EPHEMERAL(crgetgid(cr)))) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_phys->zp_flags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & AT_XVATTR) + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. 
+ */
+top:
+ *vpp = NULL;
+
+ if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
+ NULL, NULL)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (vsecp && aclp == NULL) {
+ error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
+ IS_EPHEMERAL(crgetgid(cr))) {
+ if (zfsvfs->z_fuid_obj == 0) {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+ dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ }
+ }
+ if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ if (aclp)
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+
+ *vpp = ZTOV(zp);
+
+ txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zflg = ZEXISTS;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, NULL)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ if (vp == cwd) {
+ error = EINVAL;
+ goto out;
+ }
+
+ vnevent_rmdir(vp, dvp, name, ct);
+
+ /*
+ * Grab a lock on the directory to make sure that no one is
+ * trying to add (or lookup) entries while we are removing it.
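+ * (Lookups and entry creation take z_name_lock as readers, so the
+ * write lock here should exclude them until the removal commits.)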
+ */
+ rw_enter(&zp->z_name_lock, RW_WRITER);
+
+ /*
+ * Grab a lock on the parent pointer to make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+ zfs_dirent_unlock(dl);
+ VN_RELE(vp);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_remove(zilog, tx, txtype, dzp, name);
+ }
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+out:
+ zfs_dirent_unlock(dl);
+
+ VN_RELE(vp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ edirent_t *eodp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ int local_eof;
+ int outcount;
+ int error;
+ uint8_t prefetch;
+ boolean_t check_sysattrs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (uio->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if ((*eofp = zp->z_unlinked) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = uio->uio_loffset;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
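+ * With a single kernel-space iovec we can fill the caller's buffer
+ * directly; otherwise the entries are staged in a kmem buffer and
+ * copied out with uiomove() when the loop below is done.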
+ */ + iovp = uio->uio_iov; + bytes_wanted = iovp->iov_len; + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + odp = (struct dirent64 *)iovp->iov_base; + } + eodp = (struct edirent *)odp; + + /* + * If this VFS supports system attributes; and we're looking at an + * extended attribute directory; and we care about normalization + * conflicts on this vfs; then we must check for normalization + * conflicts with the sysattr name space. + */ + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + off64_t *next; + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = zp->z_phys->zp_parent; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + } else { + /* + * Grab next entry. + */ + if (error = zap_cursor_retrieve(&zc, &zap)) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = ENXIO; + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + + if (check_sysattrs && !zap.za_normalization_conflict) { + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); + } + } + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = EINVAL; + goto update; + } + break; + } + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry */ + next = &(eodp->ed_off); + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + /* NOTE: d_off is the offset for the *next* entry */ + next = &(odp->d_off); + (void) strncpy(odp->d_name, zap.za_name, + DIRENT64_NAMELEN(reclen)); + odp = (dirent64_t *)((intptr_t)odp + reclen); + } + outcount += reclen; + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0); + + /* + * Move to the next entry, fill in the previous offset. 
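+ * The fake entries ('.', '..', and '.zfs' when shown) just bump the
+ * offset; real entries advance the ZAP cursor and use its serialized
+ * position, so the cookie handed back stays valid across calls.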
+ */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + *next = offset; + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + uio->uio_resid -= outcount; + } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { + /* + * Reset the pointer. + */ + offset = uio->uio_loffset; + } + +update: + zap_cursor_fini(&zc); + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio->uio_loffset = offset; + ZFS_EXIT(zfsvfs); + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +static int +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + /* + * Regardless of whether this is required for standards conformance, + * this is the logical behavior when fsync() is called on a file with + * dirty pages. We use B_ASYNC since the ZIL transactions are already + * going to be pushed out as part of the zil_commit(). + */ + if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && + (vp->v_type == VREG) && !(IS_SWAPVP(vp))) + (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + ZFS_EXIT(zfsvfs); + return (0); +} + + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * ct - caller context + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +static int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_phys_t *pzp; + int error = 0; + uint64_t links; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + pzp = zp->z_phys; + + mutex_enter(&zp->z_lock); + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && + (pzp->zp_uid != crgetuid(cr))) { + if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + vap->va_type = vp->v_type; + vap->va_mode = pzp->zp_mode & MODEMASK; + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; + vap->va_nodeid = zp->z_id; + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) + links = pzp->zp_links + 1; + else + links = pzp->zp_links; + vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! 
*/ + vap->va_size = pzp->zp_size; + vap->va_rdev = vp->v_rdev; + vap->va_seq = zp->z_seq; + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. + */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((pzp->zp_flags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((pzp->zp_flags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((pzp->zp_flags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((pzp->zp_flags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((pzp->zp_flags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((pzp->zp_flags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((pzp->zp_flags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((pzp->zp_flags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG && + (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + /* + * Only VREG files have anti-virus scanstamps, so we + * won't conflict with symlinks in the bonus buffer. + */ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len <= doi.doi_bonus_size) { + /* + * pzp points to the start of the + * znode_phys_t. pzp + 1 points to the + * first byte after the znode_phys_t. + */ + (void) memcpy(xoap->xoa_av_scanstamp, + pzp + 1, + sizeof (xoap->xoa_av_scanstamp)); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); + ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); + ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); + + mutex_exit(&zp->z_lock); + + dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks); + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: vp - vnode of file to be modified. + * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). 
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ znode_phys_t *pzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp = NULL;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ pzp = zp->z_phys;
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Make sure that if we have ephemeral uid/gid or xvattr specified
+ * that file system is at proper version level
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & AT_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ if (mask & AT_SIZE && vp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (EISDIR);
+ }
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+
+ /*
+ * On an immutable file only the immutable bit and atime can be altered.
+ */
+ if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
+ ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
+ ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
+ if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
+ /*
+ * Verify the timestamps don't overflow 32 bits.
+ * ZFS can handle large timestamps, but 32-bit syscalls can't
+ * handle times beyond 2038. This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ ZFS_EXIT(zfsvfs);
+ return (EOVERFLOW);
+ }
+ }
+
+top:
+ attrzp = NULL;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (EROFS);
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
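+ * The do/while loop below retries zfs_freesp() for as long as its
+ * tx assignment fails with ERESTART in TXG_NOWAIT mode.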
+ */ + do { + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + /* NB: we already did dmu_tx_wait() if necessary */ + } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = pzp->zp_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = pzp->zp_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + if ((need_policy == FALSE) && + (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && + xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && + xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && + xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_NODUMP) && + xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) || + (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && + xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || + ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && + ((vp->v_type != VREG && xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. 
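+ * The full mask is restored once the policy check passes, so the
+ * trimmed attributes are still applied further down.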
+ */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_bonus(tx, zp->z_id); + if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + + if (mask & AT_MODE) { + uint64_t pmode = pzp->zp_mode; + + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + if (pzp->zp_acl.z_acl_extern_obj) { + /* Are we upgrading ACL from old V0 format to new V1 */ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + pzp->zp_acl.z_acl_version == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, + pzp->zp_acl.z_acl_extern_obj, 0, + aclp->z_acl_bytes); + } + } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + } + + if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + if (aclp) + zfs_acl_free(aclp); + return (err); + } + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + + err = dmu_tx_assign(tx, zfsvfs->z_assign); + if (err) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); + + if (aclp) { + zfs_acl_free(aclp); + aclp = NULL; + } + + if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (err); + } + + dmu_buf_will_dirty(zp->z_dbuf, tx); + + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. 
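+ * (The ctime update comes from the zfs_time_stamper_locked() calls
+ * below.)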
+ */ + + mutex_enter(&zp->z_lock); + + if (mask & AT_MODE) { + mutex_enter(&zp->z_acl_lock); + zp->z_phys->zp_mode = new_mode; + err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + ASSERT3U(err, ==, 0); + mutex_exit(&zp->z_acl_lock); + } + + if (attrzp) + mutex_enter(&attrzp->z_lock); + + if (mask & AT_UID) { + pzp->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); + if (attrzp) { + attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, + vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); + } + } + + if (mask & AT_GID) { + pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, + cr, ZFS_GROUP, tx, &fuidp); + if (attrzp) + attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, + vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); + } + + if (aclp) + zfs_acl_free(aclp); + + if (attrzp) + mutex_exit(&attrzp->z_lock); + + if (mask & AT_ATIME) + ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + + if (mask & AT_MTIME) + ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + + if (mask & AT_SIZE) + zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); + else if (mask != 0) + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + size_t len; + dmu_object_info_t doi; + + ASSERT(vp->v_type == VREG); + + /* Grow the bonus buffer if necessary. */ + dmu_object_info_from_db(zp->z_dbuf, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + sizeof (znode_phys_t); + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); + } + zfs_xvattr_set(zp, xvap); + } + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + mutex_exit(&zp->z_lock); + + if (attrzp) + VN_RELE(ZTOV(attrzp)); + + dmu_tx_commit(tx); + + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + VN_RELE(ZTOV(zl->zl_znode)); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = zp->z_zfsvfs->z_root; + uint64_t *oidp = &zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. 
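+ * so blocking is safe; as a READER we may already hold locks the
+ * other rename needs, hence the drop-and-restart check below.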
+ */
+ if (rw == RW_READER && zp->z_id > szp->z_id) {
+ /*
+ * Drop our locks and restart
+ */
+ zfs_rename_unlock(&zl);
+ *zlpp = NULL;
+ zp = tdzp;
+ oidp = &zp->z_id;
+ rwlp = &szp->z_parent_lock;
+ rw = RW_WRITER;
+ continue;
+ } else {
+ /*
+ * Wait for other thread to drop its locks
+ */
+ rw_enter(rwlp, rw);
+ }
+ }
+
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ if (*oidp == szp->z_id) /* We're a descendant of szp */
+ return (EINVAL);
+
+ if (*oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ oidp = &zp->z_phys->zp_parent;
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ znode_t *tdzp, *szp, *tzp;
+ znode_t *sdzp = VTOZ(sdvp);
+ zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
+ zilog_t *zilog;
+ vnode_t *realvp;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int cmp, serr, terr;
+ int error = 0;
+ int zflg = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(sdzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Make sure we have the real vp for the target directory.
+ */
+ if (VOP_REALVP(tdvp, &realvp, ct) == 0)
+ tdvp = realvp;
+
+ if (tdvp->v_vfsp != sdvp->v_vfsp) {
+ ZFS_EXIT(zfsvfs);
+ return (EXDEV);
+ }
+
+ tdzp = VTOZ(tdvp);
+ ZFS_VERIFY_ZP(tdzp);
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EILSEQ);
+ }
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into or out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
+ (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ cmp = -1;
+ } else if (sdzp->z_id > tdzp->z_id) {
+ cmp = 1;
+ } else {
+ /*
+ * First compare the two name arguments without
+ * considering any case folding.
+ */
+ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+
+ cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
+ ASSERT(error == 0 || !zfsvfs->z_utf8);
+ if (cmp == 0) {
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
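+ * We detect that case here with an exact, non-case-folded name
+ * compare and return success without touching either directory.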
+ */ + ZFS_EXIT(zfsvfs); + return (0); + } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } + } + + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); + } else { + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + VN_RELE(ZTOV(tzp)); + } + if (strcmp(snm, "..") == 0) + serr = EINVAL; + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + VN_RELE(ZTOV(szp)); + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + + if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) + goto out; + + if (ZTOV(szp)->v_type == VDIR) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) + goto out; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (ZTOV(szp)->v_type == VDIR) { + if (ZTOV(tzp)->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + } else { + if (ZTOV(tzp)->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + } + + vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + if (tzp) + vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. 
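+ * (The source vnode itself was notified above via
+ * vnevent_rename_src().)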
+ */
+ if (tdvp != sdvp) {
+ vnevent_rename_dest_dir(tdvp, ct);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
+ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp)
+ dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
+ if (tzp)
+ dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+ VN_RELE(ZTOV(szp));
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
+
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ ASSERT(error == 0);
+
+ zfs_log_rename(zilog, tx,
+ TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
+ sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+ }
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ VN_RELE(ZTOV(szp));
+ if (tzp)
+ VN_RELE(ZTOV(tzp));
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dvp - Directory to contain new symbolic link.
+ * name - Name for new symlink entry.
+ * vap - Attributes of new entry.
+ * link - Target path of new symlink.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ int len = strlen(link);
+ int error;
+ int zflg = ZNEW;
+ zfs_fuid_info_t *fuidp = NULL;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EILSEQ);
+ }
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+top:
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (ENAMETOOLONG);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
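+ * (zflg carries ZNEW, so zfs_dirent_lock() presumably fails with
+ * EEXIST when the name is already present.)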
+ */
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
+ if (zfsvfs->z_fuid_obj == 0) {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+ dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ }
+ }
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+
+ /*
+ * Create a new object for the symlink.
+ * Put the link content into bonus buffer if it will fit;
+ * otherwise, store it just like any other file data.
+ */
+ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
+ if (len != 0)
+ bcopy(link, zp->z_phys + 1, len);
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
+ /*
+ * Nothing can access the znode yet so no locking needed
+ * for growing the znode's blocksize.
+ */
+ zfs_grow_blocksize(zp, len, tx);
+
+ VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
+ zp->z_id, 0, FTAG, &dbp));
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ zp->z_phys->zp_size = len;
+
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+out:
+ if (error == 0) {
+ uint64_t txtype = TX_SYMLINK;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ }
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ VN_RELE(ZTOV(zp));
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - structure to contain the link path.
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +static int +zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + size_t bufsz; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + bufsz = (size_t)zp->z_phys->zp_size; + if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { + error = uiomove(zp->z_phys + 1, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdvp referencing svp. + * + * IN: tdvp - Directory to contain new entry. + * svp - vnode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +static int +zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *dzp = VTOZ(tdvp); + znode_t *tzp, *szp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + vnode_t *realvp; + int error; + int zf = ZNEW; + uid_t owner; + + ASSERT(tdvp->v_type == VDIR); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (VOP_REALVP(svp, &realvp, ct) == 0) + svp = realvp; + + if (svp->v_vfsp != tdvp->v_vfsp) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } + szp = VTOZ(svp); + ZFS_VERIFY_ZP(szp); + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (EILSEQ); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + +top: + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_phys->zp_flags & ZFS_XATTR) != + (dzp->z_phys->zp_flags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (svp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && + secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Attempt to lock directory; fail if entry already exists. 
+ */
+ error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_create(dl, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_link(zilog, tx, txtype, dzp, szp, name);
+ }
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (error == 0) {
+ vnevent_link(svp, ct);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * zfs_null_putapage() is used when the file system has been force
+ * unmounted. It just drops the pages.
+ */
+/* ARGSUSED */
+static int
+zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+ size_t *lenp, int flags, cred_t *cr)
+{
+ pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
+ return (0);
+}
+
+/*
+ * Push a page out to disk, klustering if possible.
+ *
+ * IN: vp - file to push page to.
+ * pp - page to push.
+ * flags - additional flags.
+ * cr - credentials of caller.
+ *
+ * OUT: offp - start of range pushed.
+ * lenp - len of range pushed.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * NOTE: callers must have locked the page to be pushed. On
+ * exit, the page (and all other pages in the kluster) must be
+ * unlocked.
+ */
+/* ARGSUSED */
+static int
+zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+ size_t *lenp, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ dmu_tx_t *tx;
+ rl_t *rl;
+ u_offset_t off, koff;
+ size_t len, klen;
+ uint64_t filesz;
+ int err;
+
+ filesz = zp->z_phys->zp_size;
+ off = pp->p_offset;
+ len = PAGESIZE;
+ /*
+ * If our blocksize is bigger than the page size, try to kluster
+ * multiple pages so that we write a full block (thus avoiding
+ * a read-modify-write).
+ */
+ if (off < filesz && zp->z_blksz > PAGESIZE) {
+ if (!ISP2(zp->z_blksz)) {
+ /* Only one block in the file. */
+ klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+ koff = 0;
+ } else {
+ klen = zp->z_blksz;
+ koff = P2ALIGN(off, (u_offset_t)klen);
+ }
+ ASSERT(koff <= filesz);
+ if (koff + klen > filesz)
+ klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
+ pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
+ }
+ ASSERT3U(btop(len), ==, btopr(len));
+top:
+ rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ /*
+ * Can't push pages past end-of-file.
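+ * The file may have been truncated since we klustered, so re-fetch
+ * zp_size now that the range lock is held and trim the request to it.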
+ */
+ filesz = zp->z_phys->zp_size;
+ if (off >= filesz) {
+ /* ignore all pages */
+ err = 0;
+ goto out;
+ } else if (off + len > filesz) {
+ int npages = btopr(filesz - off);
+ page_t *trunc;
+
+ page_list_break(&pp, &trunc, npages);
+ /* ignore pages past end of file */
+ if (trunc)
+ pvn_write_done(trunc, flags);
+ len = filesz - off;
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ err = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (err != 0) {
+ if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ zfs_range_unlock(rl);
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ err = 0;
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz <= PAGESIZE) {
+ caddr_t va = ppmapin(pp, PROT_READ, (caddr_t)-1);
+ ASSERT3U(len, <=, PAGESIZE);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+ ppmapout(va);
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
+ }
+
+ if (err == 0) {
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0);
+ dmu_tx_commit(tx);
+ }
+
+out:
+ zfs_range_unlock(rl);
+ pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
+ if (offp)
+ *offp = off;
+ if (lenp)
+ *lenp = len;
+
+ return (err);
+}
+
+/*
+ * Write out the pages covering the indicated portion of the file.
+ * The pages are stored in a page list attached to the file's vnode.
+ *
+ * IN: vp - vnode of file to push page data to.
+ * off - position in file to put data.
+ * len - amount of data to write.
+ * flags - flags to control the operation.
+ * cr - credentials of caller.
+ * ct - caller context.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ page_t *pp;
+ size_t io_len;
+ u_offset_t io_off;
+ uint64_t filesz;
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (len == 0) {
+ /*
+ * Search the entire vp list for pages >= off.
+ */
+ error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
+ flags, cr);
+ goto out;
+ }
+
+ filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
+ if (off > filesz) {
+ /* past end of file */
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ len = MIN(len, filesz - off);
+
+ for (io_off = off; io_off < off + len; io_off += io_len) {
+ if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
+ pp = page_lookup(vp, io_off,
+ (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
+ } else {
+ pp = page_lookup_nowait(vp, io_off,
+ (flags & B_FREE) ? SE_EXCL : SE_SHARED);
+ }
+
+ if (pp != NULL && pvn_getdirty(pp, flags)) {
+ int err;
+
+ /*
+ * Found a dirty page to push
+ */
+ err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
+ if (err)
+ error = err;
+ } else {
+ io_len = PAGESIZE;
+ }
+ }
+out:
+ if ((flags & B_ASYNC) == 0)
+ zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ if (zp->z_dbuf == NULL) {
+ /*
+ * The fs has been unmounted, or we did a
+ * suspend/resume and this file no longer exists.
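+ * There is nothing left to write back; just toss any cached pages
+ * and free the znode.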
+ */
+ if (vn_has_cached_data(vp)) {
+ (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
+ B_INVAL, cr);
+ }
+
+ mutex_enter(&zp->z_lock);
+ vp->v_count = 0; /* count arrives as 1 */
+ mutex_exit(&zp->z_lock);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ zfs_znode_free(zp);
+ return;
+ }
+
+ /*
+ * Attempt to push any data in the page cache. If this fails
+ * we will get kicked out later in zfs_zinactive().
+ */
+ if (vn_has_cached_data(vp)) {
+ (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
+ cr);
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ mutex_enter(&zp->z_lock);
+ zp->z_atime_dirty = 0;
+ mutex_exit(&zp->z_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+
+ zfs_zinactive(zp);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+/*
+ * Bounds-check the seek operation.
+ *
+ * IN: vp - vnode seeking within
+ * ooff - old file offset
+ * noffp - pointer to new file offset
+ * ct - caller context
+ *
+ * RETURN: 0 if success
+ * EINVAL if new offset invalid
+ */
+/* ARGSUSED */
+static int
+zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ if (vp->v_type == VDIR)
+ return (0);
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+
+/*
+ * Pre-filter the generic locking function to trap attempts to place
+ * a mandatory lock on a memory mapped file.
+ */
+static int
+zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+ flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * We are following the UFS semantics with respect to mapcnt
+ * here: If we see that the file is mapped already, then we will
+ * return an error, but we don't worry about races between this
+ * function and zfs_map().
+ */
+ if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (EAGAIN);
+ }
+ error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data. For efficiency, we may try to fill
+ * multiple pages at once (klustering).
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+ caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+ znode_t *zp = VTOZ(vp);
+ page_t *pp, *cur_pp;
+ objset_t *os = zp->z_zfsvfs->z_os;
+ caddr_t va;
+ u_offset_t io_off, total;
+ uint64_t oid = zp->z_id;
+ size_t io_len;
+ uint64_t filesz;
+ int err;
+
+ /*
+ * If we are only asking for a single page don't bother klustering.
+ */
+ filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
+ if (off >= filesz)
+ return (EFAULT);
+ if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
+ io_off = off;
+ io_len = PAGESIZE;
+ pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
+ } else {
+ /*
+ * Try to fill a kluster of pages (a block's worth).
+ */
+ size_t klen;
+ u_offset_t koff;
+
+ if (!ISP2(zp->z_blksz)) {
+ /* Only one block in the file. */
+ klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+ koff = 0;
+ } else {
+ /*
+ * It would be ideal to align our offset to the
+ * blocksize but doing so has resulted in some
+ * strange application crashes.
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data.  For efficiency, we may try to fill
+ * multiple pages at once (klustering).
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+	znode_t *zp = VTOZ(vp);
+	page_t *pp, *cur_pp;
+	objset_t *os = zp->z_zfsvfs->z_os;
+	caddr_t va;
+	u_offset_t io_off, total;
+	uint64_t oid = zp->z_id;
+	size_t io_len;
+	uint64_t filesz;
+	int err;
+
+	/*
+	 * If we are only asking for a single page don't bother klustering.
+	 */
+	filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
+	if (off >= filesz)
+		return (EFAULT);
+	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
+		io_off = off;
+		io_len = PAGESIZE;
+		pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
+	} else {
+		/*
+		 * Try to fill a kluster of pages (a block's worth).
+		 */
+		size_t klen;
+		u_offset_t koff;
+
+		if (!ISP2(zp->z_blksz)) {
+			/* Only one block in the file. */
+			klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+			koff = 0;
+		} else {
+			/*
+			 * It would be ideal to align our offset to the
+			 * blocksize but doing so has resulted in some
+			 * strange application crashes.  For now, we
+			 * leave the offset as is and only adjust the
+			 * length if we are off the end of the file.
+			 */
+			koff = off;
+			klen = plsz;
+		}
+		ASSERT(koff <= filesz);
+		if (koff + klen > filesz)
+			klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff;
+		ASSERT3U(off, >=, koff);
+		ASSERT3U(off, <, koff + klen);
+		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
+		    &io_len, koff, klen, 0);
+	}
+	if (pp == NULL) {
+		/*
+		 * Some other thread entered the page before us.
+		 * Return to zfs_getpage to retry the lookup.
+		 */
+		*pl = NULL;
+		return (0);
+	}
+
+	/*
+	 * Fill the pages in the kluster.
+	 */
+	cur_pp = pp;
+	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+		ASSERT3U(io_off, ==, cur_pp->p_offset);
+		va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+		err = dmu_read(os, oid, io_off, PAGESIZE, va);
+		ppmapout(va);
+		if (err) {
+			/* On error, toss the entire kluster */
+			pvn_read_done(pp, B_ERROR);
+			return (err);
+		}
+		cur_pp = cur_pp->p_next;
+	}
+
+	/*
+	 * Fill in the page list array from the kluster.  If
+	 * there are too many pages in the kluster, return
+	 * as many pages as possible starting from the desired
+	 * offset `off'.
+	 * NOTE: the page list will always be null terminated.
+	 */
+	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+
+	return (0);
+}
+
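+/*
+ * A worked example of the kluster bounds above, with assumed values
+ * for illustration only: given an 8K block size (z_blksz = 0x2000,
+ * a power of two), PAGESIZE = 0x1000, a fault at off = 0x3000 with
+ * plsz = 0x2000 yields koff = 0x3000 and klen = 0x2000.  If the file
+ * ends at filesz = 0x3800, the clip becomes
+ *
+ *	klen = P2ROUNDUP(0x3800, 0x1000) - 0x3000 = 0x1000
+ *
+ * so only the single page covering EOF is read.  For a file smaller
+ * than one block (z_blksz not a power of two), the whole block is
+ * read starting from koff = 0.
+ */
+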
+/*
+ * Return pointers to the pages for the file region [off, off + len]
+ * in the pl array.  If plsz is greater than len, this function may
+ * also return page pointers from before or after the specified
+ * region (i.e. some region [off', off' + plsz]).  These additional
+ * pages are only returned if they are already in the cache, or were
+ * created as part of a klustered read.
+ *
+ * IN:	vp	- vnode of file to get data from.
+ *	off	- position in file to get data from.
+ *	len	- amount of data to retrieve.
+ *	plsz	- length of provided page list.
+ *	seg	- segment to obtain pages for.
+ *	addr	- virtual address of fault.
+ *	rw	- mode of created pages.
+ *	cr	- credentials of caller.
+ *	ct	- caller context.
+ *
+ * OUT:	protp	- protection mode of created pages.
+ *	pl	- list of pages created.
+ *
+ * RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	page_t *pp, **pl0 = pl;
+	int need_unlock = 0, err = 0;
+	offset_t orig_off;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (protp)
+		*protp = PROT_ALL;
+
+	/* no faultahead (for now) */
+	if (pl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* can't fault past EOF */
+	if (off >= zp->z_phys->zp_size) {
+		ZFS_EXIT(zfsvfs);
+		return (EFAULT);
+	}
+	orig_off = off;
+
+	/*
+	 * If we already own the lock, then we must be page faulting
+	 * in the middle of a write to this file (i.e., we are writing
+	 * to this file using data from a mapped region of the file).
+	 */
+	if (rw_owner(&zp->z_map_lock) != curthread) {
+		rw_enter(&zp->z_map_lock, RW_WRITER);
+		need_unlock = TRUE;
+	}
+
+	/*
+	 * Loop through the requested range [off, off + len] looking
+	 * for pages.  If we don't find a page, we will need to create
+	 * a new page and fill it with data from the file.
+	 */
+	while (len > 0) {
+		if (plsz < PAGESIZE)
+			break;
+		if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
+			*pl++ = pp;
+			off += PAGESIZE;
+			addr += PAGESIZE;
+			len -= PAGESIZE;
+			plsz -= PAGESIZE;
+		} else {
+			err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
+			if (err)
+				goto out;
+			/*
+			 * klustering may have changed our region
+			 * to be block aligned.
+			 */
+			if (((pp = *pl) != NULL) && (off != pp->p_offset)) {
+				int delta = off - pp->p_offset;
+				len += delta;
+				off -= delta;
+				addr -= delta;
+			}
+			while (*pl) {
+				pl++;
+				off += PAGESIZE;
+				addr += PAGESIZE;
+				plsz -= PAGESIZE;
+				if (len > PAGESIZE)
+					len -= PAGESIZE;
+				else
+					len = 0;
+			}
+		}
+	}
+
+	/*
+	 * Fill out the page array with any pages already in the cache.
+	 */
+	while (plsz > 0) {
+		pp = page_lookup_nowait(vp, off, SE_SHARED);
+		if (pp == NULL)
+			break;
+		*pl++ = pp;
+		off += PAGESIZE;
+		plsz -= PAGESIZE;
+	}
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+out:
+	/*
+	 * Grabbing the range lock for the page as reader to block
+	 * truncation would lead to deadlock, so we must recheck the
+	 * file size here instead.
+	 */
+	if (orig_off >= zp->z_phys->zp_size)
+		err = EFAULT;
+	if (err) {
+		/*
+		 * Release any pages we have previously locked.
+		 */
+		while (pl > pl0)
+			page_unlock(*--pl);
+	}
+
+	*pl = NULL;
+
+	if (need_unlock)
+		rw_exit(&zp->z_map_lock);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
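+/*
+ * To illustrate the delta adjustment in zfs_getpage() above (assumed
+ * values): if the fault was at off = 0x3000 but zfs_fillpage()
+ * klustered in a region whose first page has p_offset = 0x2000, then
+ * delta = 0x1000, and off/addr are backed up (and len widened) by
+ * delta so that the caller's pl[] entries line up with the pages
+ * actually created.  The array is then advanced past every non-NULL
+ * entry, since pl[] is always null terminated.
+ */
+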
+/*
+ * Request a memory map for a section of a file.  This code interacts
+ * with common code and the VM system as follows:
+ *
+ *	common code calls mmap(), which ends up in smmap_common()
+ *
+ *	this calls VOP_MAP(), which takes you into (say) zfs
+ *
+ *	zfs_map() calls as_map(), passing segvn_create() as the callback
+ *
+ *	segvn_create() creates the new segment and calls VOP_ADDMAP()
+ *
+ *	zfs_addmap() updates z_mapcnt
+ */
+/*ARGSUSED*/
+static int
+zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	segvn_crargs_t vn_a;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((prot & PROT_WRITE) &&
+	    (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY |
+	    ZFS_APPENDONLY))) {
+		ZFS_EXIT(zfsvfs);
+		return (EPERM);
+	}
+
+	if ((prot & (PROT_READ | PROT_EXEC)) &&
+	    (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) {
+		ZFS_EXIT(zfsvfs);
+		return (EACCES);
+	}
+
+	if (vp->v_flag & VNOMAP) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOSYS);
+	}
+
+	if (off < 0 || len > MAXOFFSET_T - off) {
+		ZFS_EXIT(zfsvfs);
+		return (ENXIO);
+	}
+
+	if (vp->v_type != VREG) {
+		ZFS_EXIT(zfsvfs);
+		return (ENODEV);
+	}
+
+	/*
+	 * If the file is locked, disallow mapping.
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
+		ZFS_EXIT(zfsvfs);
+		return (EAGAIN);
+	}
+
+	as_rangelock(as);
+	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+	if (error != 0) {
+		as_rangeunlock(as);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	vn_a.vp = vp;
+	vn_a.offset = (u_offset_t)off;
+	vn_a.type = flags & MAP_TYPE;
+	vn_a.prot = prot;
+	vn_a.maxprot = maxprot;
+	vn_a.cred = cr;
+	vn_a.amp = NULL;
+	vn_a.flags = flags & ~MAP_TYPE;
+	vn_a.szc = 0;
+	vn_a.lgrp_mem_policy_flags = 0;
+
+	error = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+	as_rangeunlock(as);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	uint64_t pages = btopr(len);
+
+	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
+	return (0);
+}
+
+/*
+ * The reason we push dirty pages as part of zfs_delmap() is so that we get a
+ * more accurate mtime for the associated file.  Since we don't have a way of
+ * detecting when the data was actually modified, we have to resort to
+ * heuristics.  If an explicit msync() is done, then we mark the mtime when the
+ * last page is pushed.  The problem occurs when the msync() call is omitted,
+ * which is by far the most common case:
+ *
+ *	open()
+ *	mmap()
+ *	<modify memory>
+ *	munmap()
+ *	close()
+ *	<time lapse>